/* * PURE UNARY (BASE-1) TRANSFORMER ENGINE * AVX-512 + OpenMP. Full Qwen2 forward pass in C. * * Thermometer encoding: magnitude M = M planes set. * Each plane contributes EXACTLY 1. No powers. No binary. * 7 planes = 8 levels {0,1,2,3,4,5,6,7} * sign. * * Model format on disk (from unary_convert.py): * .sign = [out_dim * chunks] uint64 (1=negative) * .planes = [n_planes * out_dim * chunks] uint64 (thermometer) * .scales = [out_dim] float32 (per-row) * * (c) 2026 OpenTransformers Ltd / Scott Bisset */ #include #include #include #include #include #include #include #include #define HIDDEN 1536 #define INTER 8960 #define N_HEADS 12 #define N_KV_HEADS 2 #define HEAD_DIM 128 #define N_LAYERS 28 #define VOCAB 151936 #define RMS_EPS 1e-6f #define ROPE_THETA 1000000.0f #define MAX_SEQ 4096 #define GQA_RATIO (N_HEADS / N_KV_HEADS) typedef struct { uint64_t *sign_bits; /* [out_dim * chunks] */ uint64_t *mag_planes; /* [n_planes * out_dim * chunks] */ float *scales; /* [out_dim] */ float *bias; /* [out_dim] or NULL */ int out_dim, in_dim, n_planes; } UL; /* Unary Linear */ typedef struct { uint16_t *w; int od, id; } FL; /* FP16 Linear */ typedef struct { UL qp, kp, vp, op, gp, up, dp; float *in_norm, *pn_norm; float *qb, *kb, *vb; } Lay; typedef struct { uint16_t *emb; Lay lay[N_LAYERS]; float *fnorm; FL lmh; float *kc, *vc; /* KV cache */ float *h, *h2; /* hidden states */ float *sq, *sk, *sv; /* QKV scratch */ float *ao; /* attn output */ float *sg, *su, *sd; /* MLP scratch */ float *lg; /* logits */ float *as; /* attn scores */ int np; } M; /* ============================================================ * PURE UNARY MATVEC * * y[i] = scales[i] * SUM over planes p: * SUM over j where plane_p bit j is set: * sign[j]==0 ? +x[j] : -x[j] * * Each plane contributes 1. Seven planes, seven passes. * Embarrassingly parallel over output rows. * ============================================================ */ static void umv(const UL *L, const float *x, float *y) { const int od = L->out_dim, id = L->in_dim, np = L->n_planes; const int ch = (id + 63) / 64; const int idp = (id + 15) & ~15; float *xp = (float*)aligned_alloc(64, idp * sizeof(float)); memcpy(xp, x, id * sizeof(float)); if (idp > id) memset(xp + id, 0, (idp - id) * sizeof(float)); #pragma omp parallel for schedule(dynamic, 64) for (int i = 0; i < od; i++) { const uint64_t *rs = L->sign_bits + (size_t)i * ch; float tot = 0.0f; for (int p = 0; p < np; p++) { const uint64_t *pr = L->mag_planes + ((size_t)p * od + i) * ch; __m512 acc = _mm512_setzero_ps(); for (int c = 0; c < ch; c++) { uint64_t mb = pr[c], sb = rs[c]; uint64_t pos = mb & ~sb; uint64_t neg = mb & sb; for (int g = 0; g < 4; g++) { int off = c * 64 + g * 16; if (off >= idp) break; __m512 xv = _mm512_load_ps(xp + off); __mmask16 pm = (__mmask16)((pos >> (g*16)) & 0xFFFF); __mmask16 nm = (__mmask16)((neg >> (g*16)) & 0xFFFF); acc = _mm512_mask_add_ps(acc, pm, acc, xv); acc = _mm512_mask_sub_ps(acc, nm, acc, xv); } } /* PURE UNARY: each plane worth exactly 1 */ tot += _mm512_reduce_add_ps(acc); } y[i] = tot * L->scales[i]; if (L->bias) y[i] += L->bias[i]; } free(xp); } /* FP16 matvec (lm_head only) */ static void fmv(const FL *L, const float *x, float *y) { #pragma omp parallel for schedule(dynamic, 256) for (int i = 0; i < L->od; i++) { __m512 acc = _mm512_setzero_ps(); const uint16_t *row = L->w + (size_t)i * L->id; int j; for (j = 0; j + 16 <= L->id; j += 16) { __m256i h = _mm256_loadu_si256((__m256i*)(row + j)); acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc); } float s = _mm512_reduce_add_ps(acc); for (; j < L->id; j++) { float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j]))); s += wf * x[j]; } y[i] = s; } } /* RMSNorm */ static void rn(const float *x, const float *w, float *y, int d) { __m512 sq = _mm512_setzero_ps(); int i; for (i = 0; i+16 <= d; i += 16) { __m512 v = _mm512_loadu_ps(x+i); sq = _mm512_fmadd_ps(v, v, sq); } float ss = _mm512_reduce_add_ps(sq); for (; i < d; i++) ss += x[i]*x[i]; float r = 1.0f / sqrtf(ss/d + RMS_EPS); __m512 rv = _mm512_set1_ps(r); for (i = 0; i+16 <= d; i += 16) _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps( _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i))); for (; i < d; i++) y[i] = x[i]*r*w[i]; } static void silu(float *x, int n) { for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i])); } static void emul(const float *a, const float *b, float *c, int n) { int i; for (i = 0; i+16 <= n; i += 16) _mm512_storeu_ps(c+i, _mm512_mul_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i))); for (; i < n; i++) c[i] = a[i]*b[i]; } static void va(float *y, const float *x, int n) { int i; for (i = 0; i+16 <= n; i += 16) _mm512_storeu_ps(y+i, _mm512_add_ps(_mm512_loadu_ps(y+i), _mm512_loadu_ps(x+i))); for (; i < n; i++) y[i] += x[i]; } static void rope(float *v, int pos, int d) { for (int i = 0; i < d; i += 2) { float f = 1.0f / powf(ROPE_THETA, (float)i/d); float a = pos*f, co = cosf(a), si = sinf(a); float v0 = v[i], v1 = v[i+1]; v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co; } } static void sm(float *x, int n) { float mx = x[0]; for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i]; float s = 0; for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; } float iv = 1.0f/s; for (int i = 0; i < n; i++) x[i] *= iv; } static void etok(const M *m, int t, float *o) { const uint16_t *r = m->emb + (size_t)t * HIDDEN; int i; for (i = 0; i+16 <= HIDDEN; i += 16) _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i)))); for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i]))); } static float* kvp(float *c, int l, int p, int h) { return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM; } static void do_attn(M *m, int l, int pos) { Lay *ly = &m->lay[l]; umv(&ly->qp, m->h2, m->sq); umv(&ly->kp, m->h2, m->sk); umv(&ly->vp, m->h2, m->sv); if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM); if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM); if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM); for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM); for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM); for (int h = 0; h < N_KV_HEADS; h++) { memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4); memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4); } float sc = 1.0f/sqrtf((float)HEAD_DIM); memset(m->ao, 0, N_HEADS*HEAD_DIM*4); for (int h = 0; h < N_HEADS; h++) { int kvh = h / GQA_RATIO; float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM; for (int t = 0; t <= pos; t++) { float *kk = kvp(m->kc,l,t,kvh); __m512 a = _mm512_setzero_ps(); int d; for (d = 0; d+16 <= HEAD_DIM; d += 16) a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a); float dot = _mm512_reduce_add_ps(a); for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d]; m->as[t] = dot * sc; } sm(m->as, pos+1); for (int t = 0; t <= pos; t++) { float w = m->as[t]; if (w < 1e-8f) continue; float *vv = kvp(m->vc,l,t,kvh); __m512 wv = _mm512_set1_ps(w); int d; for (d = 0; d+16 <= HEAD_DIM; d += 16) _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d))); for (; d < HEAD_DIM; d++) oh[d] += w*vv[d]; } } umv(&ly->op, m->ao, m->h2); } static void do_mlp(M *m, int l) { Lay *ly = &m->lay[l]; umv(&ly->gp, m->h2, m->sg); umv(&ly->up, m->h2, m->su); silu(m->sg, INTER); emul(m->sg, m->su, m->sd, INTER); umv(&ly->dp, m->sd, m->h2); } float* forward_token(M *m, int tid, int pos) { etok(m, tid, m->h); for (int l = 0; l < N_LAYERS; l++) { rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN); do_attn(m, l, pos); va(m->h, m->h2, HIDDEN); rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN); do_mlp(m, l); va(m->h, m->h2, HIDDEN); } rn(m->h, m->fnorm, m->h2, HIDDEN); fmv(&m->lmh, m->h2, m->lg); return m->lg; } static int samp(float *lg, int V, float T, float tp) { if (T > 0) { float it = 1.0f/T; for (int i = 0; i < V; i++) lg[i] *= it; } sm(lg, V); float *pr = (float*)malloc(V*4); int *ix = (int*)malloc(V*4); memcpy(pr, lg, V*4); for (int i = 0; i < V; i++) ix[i] = i; float cum = 0; int nk = 0; while (cum < tp && nk < V && nk < 50) { int b = nk; for (int i = nk+1; i < V; i++) if (pr[i] > pr[b]) b = i; float t = pr[nk]; pr[nk] = pr[b]; pr[b] = t; int ti = ix[nk]; ix[nk] = ix[b]; ix[b] = ti; cum += pr[nk]; nk++; } float s = 0; for (int i = 0; i < nk; i++) s += pr[i]; float r = (float)rand()/RAND_MAX * s, ac = 0; int ch = ix[0]; for (int i = 0; i < nk; i++) { ac += pr[i]; if (ac >= r) { ch = ix[i]; break; } } free(pr); free(ix); return ch; } int generate(M *m, const int *pr, int pl, int *out, int mx, float T, float tp, int eos) { srand(time(NULL)); for (int i = 0; i < pl; i++) forward_token(m, pr[i], i); int pos = pl, gen = 0; for (int t = 0; t < mx; t++) { int nx; if (T <= 0) { nx = 0; for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i; } else { nx = samp(m->lg, VOCAB, T, tp); } out[t] = nx; gen++; if (nx == eos) break; forward_token(m, nx, pos); pos++; } return gen; } /* ============================================================ * ALLOCATION + WEIGHT SETTERS (called from Python) * ============================================================ */ M* model_alloc(int np) { M *m = (M*)calloc(1, sizeof(M)); m->np = np; size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM; m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4); m->h = (float*)aligned_alloc(64,HIDDEN*4); m->h2 = (float*)aligned_alloc(64,HIDDEN*4); m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4); m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4); m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4); m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4); m->sg = (float*)aligned_alloc(64,INTER*4); m->su = (float*)aligned_alloc(64,INTER*4); m->sd = (float*)aligned_alloc(64,INTER*4); m->lg = (float*)aligned_alloc(64,VOCAB*4); m->as = (float*)aligned_alloc(64,MAX_SEQ*4); m->fnorm = (float*)aligned_alloc(64,HIDDEN*4); printf("Alloc: KV=%zuMB np=%d\n", kv*2*4/1024/1024, np); return m; } void model_set_embed(M *m, uint16_t *d) { m->emb = d; } void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); } void model_set_lm_head(M *m, uint16_t *d, int o, int i) { m->lmh.w = d; m->lmh.od = o; m->lmh.id = i; } void layer_set_norms(M *m, int l, float *i, float *p) { m->lay[l].in_norm = i; m->lay[l].pn_norm = p; } void layer_set_bias(M *m, int l, float *q, float *k, float *v) { m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v; } static void set_ul(UL *u, uint64_t *s, uint64_t *p, float *sc, int o, int i, int np) { u->sign_bits=s; u->mag_planes=p; u->scales=sc; u->out_dim=o; u->in_dim=i; u->n_planes=np; u->bias=NULL; } void layer_set_linears(M *m, int l, uint64_t*qs,uint64_t*qp,float*qc,int qo,int qi, uint64_t*ks,uint64_t*kp,float*kc,int ko,int ki, uint64_t*vs,uint64_t*vp,float*vc,int vo,int vi, uint64_t*os,uint64_t*op,float*oc,int oo,int oi, uint64_t*gs,uint64_t*gp,float*gc,int go,int gi, uint64_t*us,uint64_t*up,float*uc,int uo,int ui, uint64_t*ds,uint64_t*dp,float*dc,int doo,int di, int np) { set_ul(&m->lay[l].qp,qs,qp,qc,qo,qi,np); set_ul(&m->lay[l].kp,ks,kp,kc,ko,ki,np); set_ul(&m->lay[l].vp,vs,vp,vc,vo,vi,np); set_ul(&m->lay[l].op,os,op,oc,oo,oi,np); set_ul(&m->lay[l].gp,gs,gp,gc,go,gi,np); set_ul(&m->lay[l].up,us,up,uc,uo,ui,np); set_ul(&m->lay[l].dp,ds,dp,dc,doo,di,np); } void model_reset_cache(M *m) { size_t kv=(size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM; memset(m->kc,0,kv*4); memset(m->vc,0,kv*4); } void model_free(M *m) { free(m->kc);free(m->vc);free(m->h);free(m->h2); free(m->sq);free(m->sk);free(m->sv);free(m->ao); free(m->sg);free(m->su);free(m->sd); free(m->lg);free(m->as);free(m->fnorm);free(m); }