commit f4696a22511178826aacb4112602cf5b663798f4 Author: dominik martinez Date: Sat May 17 13:03:45 2025 -0700 Add a handful of initial blas functions diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e7de127 --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +*.FASL +*.fasl +*.lisp-temp +*.dfsl +*.pfsl +*.d64fsl +*.p64fsl +*.lx64fsl +*.lx32fsl +*.dx64fsl +*.dx32fsl +*.fx64fsl +*.fx32fsl +*.sx64fsl +*.sx32fsl +*.wx64fsl +*.wx32fsl diff --git a/cl-blas.asd b/cl-blas.asd new file mode 100644 index 0000000..a233bb1 --- /dev/null +++ b/cl-blas.asd @@ -0,0 +1,14 @@ +(asdf:defsystem #:cl-blas + :description "A Common Lisp implementation of the BLAS." + :version "0.0.1" + :author "Dominik Martinez" + :license "GPLv3" + :depends-on (#:alexandria + #:serapeum + #:lparallel + #:sb-simd) + :components ((:module "src" + :components + ((:file "package") + (:file "cl-blas") + (:file "benchmarks"))))) diff --git a/src/benchmarks.lisp b/src/benchmarks.lisp new file mode 100644 index 0000000..2281b7e --- /dev/null +++ b/src/benchmarks.lisp @@ -0,0 +1,17 @@ +(in-package :cl-blas) + +(defmacro benchmark-runtime ((iterations) &body body) + `(let* ((total-time (loop for i from 1 to ,iterations + sum (let ((start-time (get-internal-real-time))) + ,@body + (- (get-internal-real-time) start-time))))) + (float (/ total-time ,iterations 1000000)))) + +(defun benchmark-gemm (size) + (let* ((A (make-random-array `(,size ,size))) + (B (make-random-array `(,size ,size))) + (C (make-random-array `(,size ,size))) + (alpha 1.0) + (beta 1.0)) + (format t "running benchmark-mul~%") + (benchmark-runtime (5) (gemm A B C alpha beta)))) diff --git a/src/cl-blas.lisp b/src/cl-blas.lisp new file mode 100644 index 0000000..4211782 --- /dev/null +++ b/src/cl-blas.lisp @@ -0,0 +1,353 @@ +(in-package :cl-blas) + +(eval-when (:compile-toplevel :load-toplevel :execute) + (defparameter *optimize-qualities* '((speed 3) + (safety 0) + (debug 0))) + (defparameter +single-float-simd-generic+ + '((simd f32.8) + (simd-aref f32.8-aref) + (simd+ f32.8+) + (simd* f32.8*) + (simd-horizontal+ f32.8-horizontal+) + (stride 8) + (float single-float) + (0f 0.0))) + (defparameter +double-float-simd-generic+ + '((simd f64.4) + (simd-aref f64.4-aref) + (simd+ f64.4+) + (simd* f64.4*) + (simd-horizontal+ f64.4-horizontal+) + (stride 4) + (float double-float) + (0f 0.0d0))) + (defun parse-type-declarations (args type) + (mapcar (lambda (arg) + (if (= (second arg) 0) + `(,type ,(first arg)) + `((simple-array ,type ,(second arg)) ,(first arg)))) + args)) + (defun parse-body (body mapping) + (if (constantp body) + body + (mapcar + (lambda (el) + (typecase el + (cons (parse-body el mapping)) + (symbol (let* ((match (second (assoc el mapping))) + (sym (if match match el))) + sym)) + (t el))) + body))) + (defun parse-cond-clauses (type-decs) + `(and ,@(mapcar (lambda (type-dec) + `(typep ,(second type-dec) ',(first type-dec))) + type-decs)))) + +(defparameter +num-cpus+ (serapeum:count-cpus)) + +(defun random-range (min max) + (+ (random (- max min)) min)) + +(defun make-random-array (dimensions &key (min -100) (max 100) (float-type 'single-float)) + (declare ((or fixnum (cons fixnum)) dimensions) + (integer min max)) + (let ((arr (make-array dimensions :element-type float-type))) + (loop for i from 0 below (array-total-size arr) + do (setf (row-major-aref arr i) (coerce (random-range min max) float-type))) + arr)) + +(defun transpose (A) + "Transpose A of type (simple-array single-float 2)" + (declare ((simple-array single-float 2) A) + (optimize . #.*optimize-qualities*)) + (loop with m fixnum = (array-dimension A 0) + with n fixnum = (array-dimension A 1) + with B of-type (simple-array single-float 2) = (make-array `(,n ,m) :element-type 'single-float) + for i fixnum from 0 below m + do (loop for j fixnum from 0 below n + do (setf (aref B j i) (aref A i j))) + finally (return B))) + +(defmacro defblas (name args &body body) + ;; ARGS should be of the form ((NAME RANK)+). + ;; If RANK is 0, then this NAME is a scalar + (let* ((single-float-type-declarations (parse-type-declarations args 'single-float)) + (double-float-type-declarations (parse-type-declarations args 'double-float)) + (single-float-name (alexandria:symbolicate name '-single-float)) + (double-float-name (alexandria:symbolicate name '-double-float)) + (lambda-list (mapcar #'first args))) + `(progn + (defun ,single-float-name ,lambda-list + (declare (optimize . #.*optimize-qualities*) + ,@single-float-type-declarations) + ,@(parse-body body +single-float-simd-generic+)) + (defun ,double-float-name ,lambda-list + (declare (optimize . #.*optimize-qualities*) + ,@double-float-type-declarations) + ,@(parse-body body +double-float-simd-generic+)) + (defun ,name ,lambda-list + (declare (optimize . #.*optimize-qualities*)) + (cond + (,(parse-cond-clauses single-float-type-declarations) + (,single-float-name ,@lambda-list)) + (,(parse-cond-clauses double-float-type-declarations) + (,double-float-name ,@lambda-list))))))) + +;;;; Level 1 BLAS: vector, O(n) operations + +;;; axpy + +(defblas axpy ((alpha 0) (x 1) (y 1)) + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride))) + (declare (fixnum n n-block)) + (loop with alpha-vec of-type simd = (simd alpha) + for i fixnum from 0 below n-block by 8 + do (setf (simd-aref y i) + (simd+ (simd-aref y i) + (simd* alpha-vec + (simd-aref x i))))) + (loop for i fixnum from n-block below n + do (setf (aref y i) (+ (aref y i) (* alpha (aref x i))))))) + +;;; scal + +(defblas scal ((alpha 0) (x 1)) + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride))) + (declare (fixnum n n-block)) + (loop with alpha-vec of-type simd = (simd alpha) + for i fixnum from 0 below n-block by 8 + do (setf (simd-aref x i) + (simd* alpha-vec + (simd-aref x i)))) + (loop for i fixnum from n-block below n + do (setf (aref x i) (* alpha (aref x i)))))) + +;;; copy + +(defblas copy ((x 1) (y 1)) + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride))) + (declare (fixnum n n-block)) + (loop for i fixnum from 0 below n-block by 8 + do (setf (simd-aref y i) (simd-aref x i))) + (loop for i fixnum from n-block below n + do (setf (aref y i) (aref x i))))) + +;;; swap + +(defblas swap ((x 1) (y 1)) + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride))) + (declare (fixnum n n-block)) + (loop with tmp of-type simd = (simd 0) + for i fixnum from 0 below n-block by stride + do (setf tmp (simd-aref x i) + (simd-aref x i) (simd-aref y i) + (simd-aref y i) tmp)) + (loop with tmp of-type float + for i fixnum from n-block below n + do (setf tmp (aref x i) + (aref x i) (aref y i) + (aref y i) tmp)))) + +;;; dot + +(defblas dot ((x 1) (y 1))11 + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride)) + (sum 0f)) + (declare (fixnum n n-block) + (float sum)) + (loop with acc of-type simd = (simd 0) + for i fixnum from 0 below n-block by stride + do (setf acc (simd+ acc (simd* (simd-aref x i) (simd-aref y i)))) + finally (setf sum (simd-horizontal+ acc))) + (loop for i fixnum from n-block below n + do (setf sum (+ sum (* (aref x i) (aref y i))))) + sum)) + +;;; nrm2 + +(defblas nrm2 ((x 1)) + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride)) + (norm 0f)) + (declare (fixnum n n-block) + (float norm)) + (loop with acc of-type simd = (simd 0) + for i fixnum from 0 below n-block by stride + for sel of-type simd = (simd-aref x i) + do (setf acc (simd+ acc (simd* sel sel))) + finally (setf norm (simd-horizontal+ acc))) + (loop for i fixnum from n-block below n + do (setf norm (+ norm (expt (aref x i) 2)))) + (the float (sqrt norm)))) + +;;; asum + +(defblas asum ((x 1)) + (let* ((n (array-dimension x 0)) + (n-block (* (floor n stride) stride)) + (norm 0f)) + (declare (fixnum n n-block) + (float norm)) + (loop with acc of-type simd = (simd 0) + for i fixnum from 0 below n-block by stride + for sel of-type simd = (simd-aref x i) + do (setf acc (simd+ acc (simd* sel sel))) + finally (setf norm (simd-horizontal+ acc))) + (loop for i fixnum from n-block below n + do (setf norm (+ norm (expt (aref x i) 2)))) + norm)) + +;;;; Level 3 BLAS: matrix-matrix, O(n^3) operations + +(defun transpose-single-float (A) + (declare ((simple-array single-float 2) A) + (optimize . #.*optimize-qualities*)) + (loop with m fixnum = (array-dimension A 0) + with n fixnum = (array-dimension A 1) + with B of-type (simple-array single-float 2) = (make-array `(,n ,m) :element-type 'single-float) + for i fixnum from 0 below m + do (loop for j fixnum from 0 below n + do (setf (aref B j i) (aref A i j))) + finally (return B))) + +(defun transpose-double-float (A) + (declare ((simple-array double-float 2) A) + (optimize . #.*optimize-qualities*)) + (loop with m fixnum = (array-dimension A 0) + with n fixnum = (array-dimension A 1) + with B of-type (simple-array double-float 2) = (make-array `(,n ,m) :element-type 'double-float) + for i fixnum from 0 below m + do (loop for j fixnum from 0 below n + do (setf (aref B j i) (aref A i j))) + finally (return B))) + +;;; gemm + +(declaim (inline gemm-row-dot-single-float gemm-row-dot-double-float)) +(defun gemm-row-dot-single-float (A Bt C alpha beta i n r r-block) + (declare ((simple-array single-float 2) A Bt C) + (single-float alpha beta) + (fixnum i n r r-block) + (optimize . #.*optimize-qualities*)) + (loop with alpha-vec of-type f32.8 = (f32.8 alpha) + for j fixnum from 0 below n + do (setf (aref C i j) + (+ (* beta (aref C i j)) + (loop for acc of-type f32.8 = (f32.8 0) then (f32.8+ acc (f32.8* alpha-vec (f32.8-aref A i k) (f32.8-aref Bt j k))) + for k fixnum from 0 below r-block by 8 + finally (return (the single-float (f32.8-horizontal+ acc)))) + (loop for k fixnum from r-block below r + sum (* alpha (aref A i k) (aref Bt j k)) of-type single-float))))) + +(defun gemm-row-dot-double-float (A Bt C alpha beta i n r r-block) + (declare ((simple-array double-float 2) A Bt C) + (double-float alpha beta) + (fixnum i n r r-block) + (optimize . #.*optimize-qualities*)) + (loop with alpha-vec of-type f64.4 = (f64.4 alpha) + for j fixnum from 0 below n + do (setf (aref C i j) + (+ (* beta (aref C i j)) + (loop for acc of-type f64.4 = (f64.4 0) then (f64.4+ acc (f64.4* alpha-vec (f64.4-aref A i k) (f64.4-aref Bt j k))) + for k fixnum from 0 below r-block by 4 + finally (return (the double-float (f64.4-horizontal+ acc)))) + (loop for k fixnum from r-block below r + sum (* alpha (aref A i k) (aref Bt j k)) of-type double-float))))) + +(defun gemm-parallel-single-float (A B C alpha beta) + (declare ((simple-array single-float 2) A B C) + (single-float alpha beta) + (optimize . #.*optimize-qualities*)) + (setf lparallel:*kernel* (lparallel:make-kernel +num-cpus+)) + (let ((channel (lparallel:make-channel))) + (loop with m fixnum = (array-dimension C 0) + with n fixnum = (array-dimension C 1) + with r-block fixnum = (* (floor (array-dimension A 1) 8) 8) + with r fixnum = (array-dimension A 1) + with Bt of-type (simple-array single-float 2) = (transpose-single-float B) + for i fixnum from 0 below m + do (lparallel:submit-task channel #'gemm-row-dot-single-float A Bt C alpha beta i n r r-block)) + (loop for i from 1 to (array-dimension C 0) + do (lparallel:receive-result channel))) + (lparallel:end-kernel) + nil) + +(defun gemm-parallel-double-float (A B C alpha beta) + (declare ((simple-array double-float 2) A B C) + (double-float alpha beta) + (optimize . #.*optimize-qualities*)) + (setf lparallel:*kernel* (lparallel:make-kernel +num-cpus+)) + (let ((channel (lparallel:make-channel))) + (loop with m fixnum = (array-dimension C 0) + with n fixnum = (array-dimension C 1) + with r-block fixnum = (* (floor (array-dimension A 1) 4) 4) + with r fixnum = (array-dimension A 1) + with Bt of-type (simple-array double-float 2) = (transpose-double-float B) + for i fixnum from 0 below m + do (lparallel:submit-task channel #'gemm-row-dot-double-float A Bt C alpha beta i n r r-block)) + (loop for i from 1 to (array-dimension C 0) + do (lparallel:receive-result channel))) + (lparallel:end-kernel) + nil) + +(defun gemm-serial-single-float (A B C alpha beta) + (declare ((simple-array single-float 2) A B C) + (single-float alpha beta) + (optimize . #.*optimize-qualities*)) + (loop with m fixnum = (array-dimension C 0) + with n fixnum = (array-dimension C 1) + with r-block fixnum = (* (floor (array-dimension A 1) 8) 8) + with r fixnum = (array-dimension A 1) + with Bt of-type (simple-array single-float 2) = (transpose-single-float B) + for i fixnum from 0 below m + do (gemm-row-dot-single-float A Bt C alpha beta i n r r-block))) + +(defun gemm-serial-double-float (A B C alpha beta) + (declare ((simple-array double-float 2) A B C) + (double-float alpha beta) + (optimize . #.*optimize-qualities*)) + (loop with m fixnum = (array-dimension C 0) + with n fixnum = (array-dimension C 1) + with r-block fixnum = (* (floor (array-dimension A 1) 4) 4) + with r fixnum = (array-dimension A 1) + with Bt of-type (simple-array double-float 2) = (transpose-double-float B) + for i fixnum from 0 below m + do (gemm-row-dot-double-float A Bt C alpha beta i n r r-block))) + +(defun gemm-single-float (A B C alpha beta) + (declare ((simple-array single-float 2) A B C) + (single-float alpha beta) + (optimize (speed 3))) + (if (< (array-total-size C) 512) + (gemm-serial-single-float A B C alpha beta) + (gemm-parallel-single-float A B C alpha beta))) + +(defun gemm-double-float (A B C alpha beta) + (declare ((simple-array double-float 2) A B C) + (double-float alpha beta) + (optimize (speed 3))) + (if (< (array-total-size C) 512) + (gemm-serial-double-float A B C alpha beta) + (gemm-parallel-double-float A B C alpha beta))) + +(defun gemm (A B C alpha beta) + (cond + ((and (typep A '(simple-array single-float 2)) + (typep B '(simple-array single-float 2)) + (typep C '(simple-array single-float 2)) + (typep alpha 'single-float) + (typep beta 'single-float)) + (gemm-single-float A B C alpha beta)) + ((and (typep A '(simple-array double-float 2)) + (typep B '(simple-array double-float 2)) + (typep C '(simple-array double-float 2)) + (typep alpha 'double-float) + (typep beta 'double-float)) + (gemm-double-float A B C alpha beta)))) diff --git a/src/package.lisp b/src/package.lisp new file mode 100644 index 0000000..7811500 --- /dev/null +++ b/src/package.lisp @@ -0,0 +1,11 @@ +(defpackage :cl-blas + (:use #:cl #:sb-simd-avx) + (:export #:axpy + #:scal + #:copy + #:swap + #:dot + #:nrm2 + #:asum + #:gemm)) +