Add a handful of initial blas functions

This commit is contained in:
dominik martinez 2025-05-17 13:03:45 -07:00
commit f4696a2251
5 changed files with 412 additions and 0 deletions

17
.gitignore vendored Normal file
View file

@ -0,0 +1,17 @@
*.FASL
*.fasl
*.lisp-temp
*.dfsl
*.pfsl
*.d64fsl
*.p64fsl
*.lx64fsl
*.lx32fsl
*.dx64fsl
*.dx32fsl
*.fx64fsl
*.fx32fsl
*.sx64fsl
*.sx32fsl
*.wx64fsl
*.wx32fsl

14
cl-blas.asd Normal file
View file

@ -0,0 +1,14 @@
(asdf:defsystem #:cl-blas
:description "A Common Lisp implementation of the BLAS."
:version "0.0.1"
:author "Dominik Martinez"
:license "GPLv3"
:depends-on (#:alexandria
#:serapeum
#:lparallel
#:sb-simd)
:components ((:module "src"
:components
((:file "package")
(:file "cl-blas")
(:file "benchmarks")))))

17
src/benchmarks.lisp Normal file
View file

@ -0,0 +1,17 @@
(in-package :cl-blas)
(defmacro benchmark-runtime ((iterations) &body body)
`(let* ((total-time (loop for i from 1 to ,iterations
sum (let ((start-time (get-internal-real-time)))
,@body
(- (get-internal-real-time) start-time)))))
(float (/ total-time ,iterations 1000000))))
(defun benchmark-gemm (size)
(let* ((A (make-random-array `(,size ,size)))
(B (make-random-array `(,size ,size)))
(C (make-random-array `(,size ,size)))
(alpha 1.0)
(beta 1.0))
(format t "running benchmark-mul~%")
(benchmark-runtime (5) (gemm A B C alpha beta))))

353
src/cl-blas.lisp Normal file
View file

@ -0,0 +1,353 @@
(in-package :cl-blas)
(eval-when (:compile-toplevel :load-toplevel :execute)
(defparameter *optimize-qualities* '((speed 3)
(safety 0)
(debug 0)))
(defparameter +single-float-simd-generic+
'((simd f32.8)
(simd-aref f32.8-aref)
(simd+ f32.8+)
(simd* f32.8*)
(simd-horizontal+ f32.8-horizontal+)
(stride 8)
(float single-float)
(0f 0.0)))
(defparameter +double-float-simd-generic+
'((simd f64.4)
(simd-aref f64.4-aref)
(simd+ f64.4+)
(simd* f64.4*)
(simd-horizontal+ f64.4-horizontal+)
(stride 4)
(float double-float)
(0f 0.0d0)))
(defun parse-type-declarations (args type)
(mapcar (lambda (arg)
(if (= (second arg) 0)
`(,type ,(first arg))
`((simple-array ,type ,(second arg)) ,(first arg))))
args))
(defun parse-body (body mapping)
(if (constantp body)
body
(mapcar
(lambda (el)
(typecase el
(cons (parse-body el mapping))
(symbol (let* ((match (second (assoc el mapping)))
(sym (if match match el)))
sym))
(t el)))
body)))
(defun parse-cond-clauses (type-decs)
`(and ,@(mapcar (lambda (type-dec)
`(typep ,(second type-dec) ',(first type-dec)))
type-decs))))
(defparameter +num-cpus+ (serapeum:count-cpus))
(defun random-range (min max)
(+ (random (- max min)) min))
(defun make-random-array (dimensions &key (min -100) (max 100) (float-type 'single-float))
(declare ((or fixnum (cons fixnum)) dimensions)
(integer min max))
(let ((arr (make-array dimensions :element-type float-type)))
(loop for i from 0 below (array-total-size arr)
do (setf (row-major-aref arr i) (coerce (random-range min max) float-type)))
arr))
(defun transpose (A)
"Transpose A of type (simple-array single-float 2)"
(declare ((simple-array single-float 2) A)
(optimize . #.*optimize-qualities*))
(loop with m fixnum = (array-dimension A 0)
with n fixnum = (array-dimension A 1)
with B of-type (simple-array single-float 2) = (make-array `(,n ,m) :element-type 'single-float)
for i fixnum from 0 below m
do (loop for j fixnum from 0 below n
do (setf (aref B j i) (aref A i j)))
finally (return B)))
(defmacro defblas (name args &body body)
;; ARGS should be of the form ((NAME RANK)+).
;; If RANK is 0, then this NAME is a scalar
(let* ((single-float-type-declarations (parse-type-declarations args 'single-float))
(double-float-type-declarations (parse-type-declarations args 'double-float))
(single-float-name (alexandria:symbolicate name '-single-float))
(double-float-name (alexandria:symbolicate name '-double-float))
(lambda-list (mapcar #'first args)))
`(progn
(defun ,single-float-name ,lambda-list
(declare (optimize . #.*optimize-qualities*)
,@single-float-type-declarations)
,@(parse-body body +single-float-simd-generic+))
(defun ,double-float-name ,lambda-list
(declare (optimize . #.*optimize-qualities*)
,@double-float-type-declarations)
,@(parse-body body +double-float-simd-generic+))
(defun ,name ,lambda-list
(declare (optimize . #.*optimize-qualities*))
(cond
(,(parse-cond-clauses single-float-type-declarations)
(,single-float-name ,@lambda-list))
(,(parse-cond-clauses double-float-type-declarations)
(,double-float-name ,@lambda-list)))))))
;;;; Level 1 BLAS: vector, O(n) operations
;;; axpy
(defblas axpy ((alpha 0) (x 1) (y 1))
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride)))
(declare (fixnum n n-block))
(loop with alpha-vec of-type simd = (simd alpha)
for i fixnum from 0 below n-block by 8
do (setf (simd-aref y i)
(simd+ (simd-aref y i)
(simd* alpha-vec
(simd-aref x i)))))
(loop for i fixnum from n-block below n
do (setf (aref y i) (+ (aref y i) (* alpha (aref x i)))))))
;;; scal
(defblas scal ((alpha 0) (x 1))
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride)))
(declare (fixnum n n-block))
(loop with alpha-vec of-type simd = (simd alpha)
for i fixnum from 0 below n-block by 8
do (setf (simd-aref x i)
(simd* alpha-vec
(simd-aref x i))))
(loop for i fixnum from n-block below n
do (setf (aref x i) (* alpha (aref x i))))))
;;; copy
(defblas copy ((x 1) (y 1))
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride)))
(declare (fixnum n n-block))
(loop for i fixnum from 0 below n-block by 8
do (setf (simd-aref y i) (simd-aref x i)))
(loop for i fixnum from n-block below n
do (setf (aref y i) (aref x i)))))
;;; swap
(defblas swap ((x 1) (y 1))
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride)))
(declare (fixnum n n-block))
(loop with tmp of-type simd = (simd 0)
for i fixnum from 0 below n-block by stride
do (setf tmp (simd-aref x i)
(simd-aref x i) (simd-aref y i)
(simd-aref y i) tmp))
(loop with tmp of-type float
for i fixnum from n-block below n
do (setf tmp (aref x i)
(aref x i) (aref y i)
(aref y i) tmp))))
;;; dot
(defblas dot ((x 1) (y 1))11
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride))
(sum 0f))
(declare (fixnum n n-block)
(float sum))
(loop with acc of-type simd = (simd 0)
for i fixnum from 0 below n-block by stride
do (setf acc (simd+ acc (simd* (simd-aref x i) (simd-aref y i))))
finally (setf sum (simd-horizontal+ acc)))
(loop for i fixnum from n-block below n
do (setf sum (+ sum (* (aref x i) (aref y i)))))
sum))
;;; nrm2
(defblas nrm2 ((x 1))
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride))
(norm 0f))
(declare (fixnum n n-block)
(float norm))
(loop with acc of-type simd = (simd 0)
for i fixnum from 0 below n-block by stride
for sel of-type simd = (simd-aref x i)
do (setf acc (simd+ acc (simd* sel sel)))
finally (setf norm (simd-horizontal+ acc)))
(loop for i fixnum from n-block below n
do (setf norm (+ norm (expt (aref x i) 2))))
(the float (sqrt norm))))
;;; asum
(defblas asum ((x 1))
(let* ((n (array-dimension x 0))
(n-block (* (floor n stride) stride))
(norm 0f))
(declare (fixnum n n-block)
(float norm))
(loop with acc of-type simd = (simd 0)
for i fixnum from 0 below n-block by stride
for sel of-type simd = (simd-aref x i)
do (setf acc (simd+ acc (simd* sel sel)))
finally (setf norm (simd-horizontal+ acc)))
(loop for i fixnum from n-block below n
do (setf norm (+ norm (expt (aref x i) 2))))
norm))
;;;; Level 3 BLAS: matrix-matrix, O(n^3) operations
(defun transpose-single-float (A)
(declare ((simple-array single-float 2) A)
(optimize . #.*optimize-qualities*))
(loop with m fixnum = (array-dimension A 0)
with n fixnum = (array-dimension A 1)
with B of-type (simple-array single-float 2) = (make-array `(,n ,m) :element-type 'single-float)
for i fixnum from 0 below m
do (loop for j fixnum from 0 below n
do (setf (aref B j i) (aref A i j)))
finally (return B)))
(defun transpose-double-float (A)
(declare ((simple-array double-float 2) A)
(optimize . #.*optimize-qualities*))
(loop with m fixnum = (array-dimension A 0)
with n fixnum = (array-dimension A 1)
with B of-type (simple-array double-float 2) = (make-array `(,n ,m) :element-type 'double-float)
for i fixnum from 0 below m
do (loop for j fixnum from 0 below n
do (setf (aref B j i) (aref A i j)))
finally (return B)))
;;; gemm
(declaim (inline gemm-row-dot-single-float gemm-row-dot-double-float))
(defun gemm-row-dot-single-float (A Bt C alpha beta i n r r-block)
(declare ((simple-array single-float 2) A Bt C)
(single-float alpha beta)
(fixnum i n r r-block)
(optimize . #.*optimize-qualities*))
(loop with alpha-vec of-type f32.8 = (f32.8 alpha)
for j fixnum from 0 below n
do (setf (aref C i j)
(+ (* beta (aref C i j))
(loop for acc of-type f32.8 = (f32.8 0) then (f32.8+ acc (f32.8* alpha-vec (f32.8-aref A i k) (f32.8-aref Bt j k)))
for k fixnum from 0 below r-block by 8
finally (return (the single-float (f32.8-horizontal+ acc))))
(loop for k fixnum from r-block below r
sum (* alpha (aref A i k) (aref Bt j k)) of-type single-float)))))
(defun gemm-row-dot-double-float (A Bt C alpha beta i n r r-block)
(declare ((simple-array double-float 2) A Bt C)
(double-float alpha beta)
(fixnum i n r r-block)
(optimize . #.*optimize-qualities*))
(loop with alpha-vec of-type f64.4 = (f64.4 alpha)
for j fixnum from 0 below n
do (setf (aref C i j)
(+ (* beta (aref C i j))
(loop for acc of-type f64.4 = (f64.4 0) then (f64.4+ acc (f64.4* alpha-vec (f64.4-aref A i k) (f64.4-aref Bt j k)))
for k fixnum from 0 below r-block by 4
finally (return (the double-float (f64.4-horizontal+ acc))))
(loop for k fixnum from r-block below r
sum (* alpha (aref A i k) (aref Bt j k)) of-type double-float)))))
(defun gemm-parallel-single-float (A B C alpha beta)
(declare ((simple-array single-float 2) A B C)
(single-float alpha beta)
(optimize . #.*optimize-qualities*))
(setf lparallel:*kernel* (lparallel:make-kernel +num-cpus+))
(let ((channel (lparallel:make-channel)))
(loop with m fixnum = (array-dimension C 0)
with n fixnum = (array-dimension C 1)
with r-block fixnum = (* (floor (array-dimension A 1) 8) 8)
with r fixnum = (array-dimension A 1)
with Bt of-type (simple-array single-float 2) = (transpose-single-float B)
for i fixnum from 0 below m
do (lparallel:submit-task channel #'gemm-row-dot-single-float A Bt C alpha beta i n r r-block))
(loop for i from 1 to (array-dimension C 0)
do (lparallel:receive-result channel)))
(lparallel:end-kernel)
nil)
(defun gemm-parallel-double-float (A B C alpha beta)
(declare ((simple-array double-float 2) A B C)
(double-float alpha beta)
(optimize . #.*optimize-qualities*))
(setf lparallel:*kernel* (lparallel:make-kernel +num-cpus+))
(let ((channel (lparallel:make-channel)))
(loop with m fixnum = (array-dimension C 0)
with n fixnum = (array-dimension C 1)
with r-block fixnum = (* (floor (array-dimension A 1) 4) 4)
with r fixnum = (array-dimension A 1)
with Bt of-type (simple-array double-float 2) = (transpose-double-float B)
for i fixnum from 0 below m
do (lparallel:submit-task channel #'gemm-row-dot-double-float A Bt C alpha beta i n r r-block))
(loop for i from 1 to (array-dimension C 0)
do (lparallel:receive-result channel)))
(lparallel:end-kernel)
nil)
(defun gemm-serial-single-float (A B C alpha beta)
(declare ((simple-array single-float 2) A B C)
(single-float alpha beta)
(optimize . #.*optimize-qualities*))
(loop with m fixnum = (array-dimension C 0)
with n fixnum = (array-dimension C 1)
with r-block fixnum = (* (floor (array-dimension A 1) 8) 8)
with r fixnum = (array-dimension A 1)
with Bt of-type (simple-array single-float 2) = (transpose-single-float B)
for i fixnum from 0 below m
do (gemm-row-dot-single-float A Bt C alpha beta i n r r-block)))
(defun gemm-serial-double-float (A B C alpha beta)
(declare ((simple-array double-float 2) A B C)
(double-float alpha beta)
(optimize . #.*optimize-qualities*))
(loop with m fixnum = (array-dimension C 0)
with n fixnum = (array-dimension C 1)
with r-block fixnum = (* (floor (array-dimension A 1) 4) 4)
with r fixnum = (array-dimension A 1)
with Bt of-type (simple-array double-float 2) = (transpose-double-float B)
for i fixnum from 0 below m
do (gemm-row-dot-double-float A Bt C alpha beta i n r r-block)))
(defun gemm-single-float (A B C alpha beta)
(declare ((simple-array single-float 2) A B C)
(single-float alpha beta)
(optimize (speed 3)))
(if (< (array-total-size C) 512)
(gemm-serial-single-float A B C alpha beta)
(gemm-parallel-single-float A B C alpha beta)))
(defun gemm-double-float (A B C alpha beta)
(declare ((simple-array double-float 2) A B C)
(double-float alpha beta)
(optimize (speed 3)))
(if (< (array-total-size C) 512)
(gemm-serial-double-float A B C alpha beta)
(gemm-parallel-double-float A B C alpha beta)))
(defun gemm (A B C alpha beta)
(cond
((and (typep A '(simple-array single-float 2))
(typep B '(simple-array single-float 2))
(typep C '(simple-array single-float 2))
(typep alpha 'single-float)
(typep beta 'single-float))
(gemm-single-float A B C alpha beta))
((and (typep A '(simple-array double-float 2))
(typep B '(simple-array double-float 2))
(typep C '(simple-array double-float 2))
(typep alpha 'double-float)
(typep beta 'double-float))
(gemm-double-float A B C alpha beta))))

11
src/package.lisp Normal file
View file

@ -0,0 +1,11 @@
(defpackage :cl-blas
(:use #:cl #:sb-simd-avx)
(:export #:axpy
#:scal
#:copy
#:swap
#:dot
#:nrm2
#:asum
#:gemm))