diff --git a/native/Makefile b/native/Makefile index 2d63753..1d4c402 100644 --- a/native/Makefile +++ b/native/Makefile @@ -2,6 +2,9 @@ all: mine override CFLAGS += -march=native -mtune=native -O3 -flto -Wall -Wextra -Wpedantic -Wno-deprecated-declarations -Wno-format +NVCC ?= nvcc +NVCCFLAGS ?= -O3 -arch=sm_75 --ptxas-options=-v -DUSE_BITMAP_FILTER + # make mine_nogmp CFLAGS=-DBENCHMARK mine: mine.c util.h @@ -9,3 +12,6 @@ mine: mine.c util.h mine_nogmp: mine_nogmp.c util.h bigint.h $(CC) mine_nogmp.c -o mine_nogmp $(CFLAGS) -lcrypto + +mine_cuda: mine_cuda.cu util.h bigint.h + $(NVCC) mine_cuda.cu -o mine_cuda $(NVCCFLAGS) diff --git a/native/mine_cuda b/native/mine_cuda new file mode 100755 index 0000000..99850c0 Binary files /dev/null and b/native/mine_cuda differ diff --git a/native/mine_cuda.cu b/native/mine_cuda.cu new file mode 100644 index 0000000..fc22816 --- /dev/null +++ b/native/mine_cuda.cu @@ -0,0 +1,590 @@ +// mine_cuda.cu - CUDA port of mine_nogmp.c +// Based on the OpenCL port (ocl_mine.cl / ocl_mine.py) +// +// USAGE: ./mine_cuda precomputed.bin did:key: [prefix2 ...] +// (thread count arg is accepted for CLI compatibility but ignored - GPU config is compile-time) + +#include +#include +#include +#include +#include +#include +#include +#include + +// ---- tuning knobs ---- +#ifndef STEPS_PER_TASK +#define STEPS_PER_TASK 512 +#endif +#ifndef MAX_RESULTS +#define MAX_RESULTS 64 +#endif +#ifndef WORK_SIZE +#define WORK_SIZE (1 << 20) // handles per kernel call +#endif +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 256 +#endif + +// ---- fixed offsets (matching mine_nogmp.c and ocl_mine.py) ---- +#define PRESIGNED_LEN 155 +#define PRESIGNED_HANDLE_OFF 55 +#define PRESIGNED_PUBKEY_OFF 77 + +#define SIGNED_LEN 247 +#define SIGNED_SIG_OFF 7 +#define SIGNED_HANDLE_OFF 147 +#define SIGNED_PUBKEY_OFF 169 + +#define RESULT_STRIDE 68 // handle(6)+pad(2)+row(4)+k_inv(32)+did_b32(24) + +#define MAX_PREFIXES 16 +#define MAX_PREFIX_DATA 128 +#define MASK26 0x03FFFFFFu + +// ---- constant memory ---- +__constant__ uint8_t d_presigned_tpl[PRESIGNED_LEN]; +__constant__ uint8_t d_signed_tpl[SIGNED_LEN]; +__constant__ uint32_t d_firstbyte_bitmap[8]; // 256-bit bitmap for first-byte filter +__constant__ uint8_t d_firstbytes[MAX_PREFIXES]; +__constant__ uint8_t d_prefix_data[MAX_PREFIX_DATA]; +__constant__ uint32_t d_prefix_lens[MAX_PREFIXES]; +__constant__ uint32_t d_prefix_offsets[MAX_PREFIXES]; +__constant__ uint32_t d_num_prefixes; + +// ---- SHA-256 ---- +__constant__ uint32_t K_SHA[64] = { + 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, + 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, + 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, + 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, + 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070, + 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, + 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +}; + +static __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { + return (x >> n) | (x << (32 - n)); +} +#define CH(x,y,z) ((z)^((x)&((y)^(z)))) +#define MAJ(x,y,z) (((x)&(y))|((z)&((x)|(y)))) +#define EP0(x) (rotr32(x,2)^rotr32(x,13)^rotr32(x,22)) +#define EP1(x) (rotr32(x,6)^rotr32(x,11)^rotr32(x,25)) +#define SIG0(x) (rotr32(x,7)^rotr32(x,18)^((x)>>3)) +#define SIG1(x) (rotr32(x,17)^rotr32(x,19)^((x)>>10)) + +static __device__ void sha256_compress(uint32_t state[8], uint32_t blk[16]) { + uint32_t a=state[0],b=state[1],c=state[2],d=state[3]; + uint32_t e=state[4],f=state[5],g=state[6],h=state[7]; + #pragma unroll + for (int i=0;i<16;i++){ + uint32_t t1=h+EP1(e)+CH(e,f,g)+K_SHA[i]+blk[i]; + uint32_t t2=EP0(a)+MAJ(a,b,c); + h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2; + } + #pragma unroll + for (int i=16;i<64;i++){ + blk[i&15]=SIG1(blk[(i-2)&15])+blk[(i-7)&15]+SIG0(blk[(i-15)&15])+blk[(i-16)&15]; + uint32_t t1=h+EP1(e)+CH(e,f,g)+K_SHA[i]+blk[i&15]; + uint32_t t2=EP0(a)+MAJ(a,b,c); + h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2; + } + state[0]+=a;state[1]+=b;state[2]+=c;state[3]+=d; + state[4]+=e;state[5]+=f;state[6]+=g;state[7]+=h; +} + +static __device__ void sha256_buf(const uint8_t *data, uint32_t len, uint32_t out[8]) { + const uint32_t INIT[8] = { + 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a, + 0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 + }; + uint32_t state[8]; + #pragma unroll + for (int i=0;i<8;i++) state[i]=INIT[i]; + + uint32_t blk[16]; + uint32_t pos=0; + + while (pos+64<=len) { + #pragma unroll + for (int w=0;w<16;w++){ + uint32_t b=pos+w*4; + blk[w]=((uint32_t)data[b]<<24)|((uint32_t)data[b+1]<<16) + |((uint32_t)data[b+2]<<8 )|((uint32_t)data[b+3]); + } + sha256_compress(state,blk); + pos+=64; + } + + uint32_t rem=len-pos; + #pragma unroll + for (int w=0;w<16;w++) blk[w]=0; + for (uint32_t b=0;b>2, sh=24-((b&3)<<3); + blk[w]|=((uint32_t)data[pos+b])<>2,sh=24-((b&3)<<3);blk[w]|=(uint32_t)0x80u<>26; t[9]&=MASK26; + #pragma unroll + for (int i=10;i<19;i++){t[i+1]+=t[i]>>26;hi[i-10]=(uint32_t)((t[i]&MASK26)<<4);t[i]=0;} + hi[9]=(uint32_t)(t[19]<<4); + for (int i=0;i<10;i++) + #pragma unroll + for (int j=0;j<5;j++) + t[i+j]+=(uint64_t)hi[i]*(uint64_t)C_LIMBS[j]; + + t[10]+=t[9]>>26; t[9]&=MASK26; + #pragma unroll + for (int i=10;i<14;i++){t[i+1]+=t[i]>>26;hi[i-10]=(uint32_t)((t[i]&MASK26)<<4);t[i]=0;} + hi[4]=(uint32_t)(t[14]<<4); + for (int i=0;i<5;i++) + #pragma unroll + for (int j=0;j<5;j++) + t[i+j]+=(uint64_t)hi[i]*(uint64_t)C_LIMBS[j]; + + #pragma unroll + for (int i=0;i<10;i++) t[i]+=c[i]; + #pragma unroll + for (int i=0;i<10;i++){t[i+1]+=t[i]>>26;t[i]&=MASK26;} + + uint32_t ov=(uint32_t)((t[9]>>22)+(t[10]<<4)); + #pragma unroll + for (int j=0;j<5;j++) t[j]+=(uint64_t)ov*(uint64_t)C_LIMBS[j]; + #pragma unroll + for (int i=0;i<10;i++){t[i+1]+=t[i]>>26;t[i]&=MASK26;} + + #pragma unroll + for (int i=0;i<10;i++) res[i]=(uint32_t)t[i]; + + if (res[9]&(1u<<21)){ + #pragma unroll + for (int i=0;i<10;i++) res[i]=N_LIMBS[i]-res[i]; + #pragma unroll + for (int i=0;i<9;i++){res[i+1]-=res[i]>>31;res[i]&=MASK26;} + } +} + +static __host__ __device__ void bigint_unpack(uint32_t res[10], const uint8_t buf[32]) { + res[0]=((uint32_t)buf[31])|((uint32_t)buf[30]<<8)|((uint32_t)buf[29]<<16)|((uint32_t)(buf[28]&0x03)<<24);res[0]&=MASK26; + res[1]=((uint32_t)(buf[28]>>2))|((uint32_t)buf[27]<<6)|((uint32_t)buf[26]<<14)|((uint32_t)(buf[25]&0x0F)<<22);res[1]&=MASK26; + res[2]=((uint32_t)(buf[25]>>4))|((uint32_t)buf[24]<<4)|((uint32_t)buf[23]<<12)|((uint32_t)(buf[22]&0x3F)<<20);res[2]&=MASK26; + res[3]=((uint32_t)(buf[22]>>6))|((uint32_t)buf[21]<<2)|((uint32_t)buf[20]<<10)|((uint32_t)buf[19]<<18);res[3]&=MASK26; + res[4]=((uint32_t)buf[18])|((uint32_t)buf[17]<<8)|((uint32_t)buf[16]<<16)|((uint32_t)(buf[15]&0x03)<<24);res[4]&=MASK26; + res[5]=((uint32_t)(buf[15]>>2))|((uint32_t)buf[14]<<6)|((uint32_t)buf[13]<<14)|((uint32_t)(buf[12]&0x0F)<<22);res[5]&=MASK26; + res[6]=((uint32_t)(buf[12]>>4))|((uint32_t)buf[11]<<4)|((uint32_t)buf[10]<<12)|((uint32_t)(buf[9]&0x3F)<<20);res[6]&=MASK26; + res[7]=((uint32_t)(buf[9]>>6))|((uint32_t)buf[8]<<2)|((uint32_t)buf[7]<<10)|((uint32_t)buf[6]<<18);res[7]&=MASK26; + res[8]=((uint32_t)buf[5])|((uint32_t)buf[4]<<8)|((uint32_t)buf[3]<<16)|((uint32_t)(buf[2]&0x03)<<24);res[8]&=MASK26; + res[9]=((uint32_t)(buf[2]>>2))|((uint32_t)buf[1]<<6)|((uint32_t)buf[0]<<14);res[9]&=0x003FFFFFu; +} + +static __device__ void bigint_pack(uint8_t buf[32], const uint32_t b[10]) { + buf[31]=(b[0]>>0)&0xFF;buf[30]=(b[0]>>8)&0xFF;buf[29]=(b[0]>>16)&0xFF;buf[28]=(b[0]>>24)&0x03; + buf[28]|=(b[1]&0x3F)<<2;buf[27]=(b[1]>>6)&0xFF;buf[26]=(b[1]>>14)&0xFF;buf[25]=(b[1]>>22)&0x0F; + buf[25]|=(b[2]&0x0F)<<4;buf[24]=(b[2]>>4)&0xFF;buf[23]=(b[2]>>12)&0xFF;buf[22]=(b[2]>>20)&0x3F; + buf[22]|=(b[3]&0x03)<<6;buf[21]=(b[3]>>2)&0xFF;buf[20]=(b[3]>>10)&0xFF;buf[19]=(b[3]>>18)&0xFF; + buf[18]=(b[4]>>0)&0xFF;buf[17]=(b[4]>>8)&0xFF;buf[16]=(b[4]>>16)&0xFF;buf[15]=(b[4]>>24)&0x03; + buf[15]|=(b[5]&0x3F)<<2;buf[14]=(b[5]>>6)&0xFF;buf[13]=(b[5]>>14)&0xFF;buf[12]=(b[5]>>22)&0x0F; + buf[12]|=(b[6]&0x0F)<<4;buf[11]=(b[6]>>4)&0xFF;buf[10]=(b[6]>>12)&0xFF;buf[9]=(b[6]>>20)&0x3F; + buf[9]|=(b[7]&0x03)<<6;buf[8]=(b[7]>>2)&0xFF;buf[7]=(b[7]>>10)&0xFF;buf[6]=(b[7]>>18)&0xFF; + buf[5]=(b[8]>>0)&0xFF;buf[4]=(b[8]>>8)&0xFF;buf[3]=(b[8]>>16)&0xFF;buf[2]=(b[8]>>24)&0x03; + buf[2]|=(b[9]&0x3F)<<2;buf[1]=(b[9]>>6)&0xFF;buf[0]=(b[9]>>14)&0xFF; +} + +// ---- base64url no-pad ---- +__constant__ uint8_t B64[64] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_'}; + +// Encode r_bytes[30..31] || s_bytes[0..31] (34 bytes) -> 46 chars +static __device__ void b64_raw_sig(uint8_t *out, uint8_t r0, uint8_t r1, const uint8_t *s) { + out[0]=B64[(r0>>2)&0x3f]; + out[1]=B64[((r0<<4)|(r1>>4))&0x3f]; + out[2]=B64[((r1<<2)|(s[0]>>6))&0x3f]; + out[3]=B64[s[0]&0x3f]; + #pragma unroll + for (int i=0;i<10;i++){ + uint8_t a=s[1+i*3],b=s[2+i*3],c=s[3+i*3]; + out[4+i*4]=B64[(a>>2)&0x3f]; + out[5+i*4]=B64[((a<<4)|(b>>4))&0x3f]; + out[6+i*4]=B64[((b<<2)|(c>>6))&0x3f]; + out[7+i*4]=B64[c&0x3f]; + } + out[44]=B64[(s[31]>>2)&0x3f]; + out[45]=B64[(s[31]<<4)&0x3f]; +} + +// ---- base32 multibase ---- +__constant__ uint8_t B32[32] = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','2','3','4','5','6','7'}; + +static __device__ void b32_encode(uint8_t *out, const uint8_t *data, uint32_t len) { + uint32_t i=0; + while (i+4>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; + *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f]; + *out++=B32[((c<<1)|(d>>7))&0x1f];*out++=B32[(d>>2)&0x1f]; + *out++=B32[((d<<3)|(e>>5))&0x1f];*out++=B32[e&0x1f]; + } + uint32_t rem=len-i; + if(rem==4){uint8_t a=data[i],b=data[i+1],c=data[i+2],d=data[i+3]; + *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; + *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f]; + *out++=B32[((c<<1)|(d>>7))&0x1f];*out++=B32[(d>>2)&0x1f];*out++=B32[(d<<3)&0x1f];} + else if(rem==3){uint8_t a=data[i],b=data[i+1],c=data[i+2]; + *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; + *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f];*out++=B32[(c<<1)&0x1f];} + else if(rem==2){uint8_t a=data[i],b=data[i+1]; + *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; + *out++=B32[(b>>1)&0x1f];*out++=B32[(b<<4)&0x1f];} + else if(rem==1){uint8_t a=data[i]; + *out++=B32[(a>>3)&0x1f];*out++=B32[(a<<2)&0x1f];} +} + +// ---- main kernel ---- +__global__ __launch_bounds__(BLOCK_SIZE) +void mine_plc( + const uint32_t * __restrict__ limb_table, + const uint8_t * __restrict__ r_b64_tbl, + const uint8_t * __restrict__ r_tail, + uint8_t * __restrict__ results, + uint32_t * __restrict__ result_count, + uint32_t handle_base, + uint32_t row_base, + uint32_t num_rows +) { + uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t handle_idx = handle_base + gid; + + uint8_t handle[6]; + {uint32_t idx=handle_idx;for(int j=0;j<6;j++){handle[5-j]=B64[idx&0x3f];idx>>=6;}} + + uint8_t presigned[PRESIGNED_LEN]; + #pragma unroll + for (int i=0;i>24)&0xFF;z_bytes[w*4+1]=(z_state[w]>>16)&0xFF; + z_bytes[w*4+2]=(z_state[w]>> 8)&0xFF;z_bytes[w*4+3]=(z_state[w] )&0xFF; + } + uint32_t z[10]; + bigint_unpack(z, z_bytes); + + uint8_t signed_op[SIGNED_LEN]; + #pragma unroll + for (int i=0;inum_rows) row_end=num_rows; + + for (uint32_t row=row_base; row>24)&0xFF; +#ifdef USE_BITMAP_FILTER + if (!(d_firstbyte_bitmap[b0>>5]&(1u<<(b0&31)))) continue; +#else + {int hit=0;for(uint32_t p=0;p>sh)&0xFF;} + uint8_t did_b32[24]; + b32_encode(did_b32, did_hash, 15); + + for (uint32_t p=0;p>24)&0xFF;results[roff+9]=(row>>16)&0xFF; + results[roff+10]=(row>>8)&0xFF;results[roff+11]=row&0xFF; + uint8_t k_inv_bytes[32]; + bigint_pack(k_inv_bytes, k_inv_l); + for (int b=0;b<32;b++) results[roff+12+b]=k_inv_bytes[b]; + for (int b=0;b<24;b++) results[roff+44+b]=did_b32[b]; + } + } + } +} + +// ---- host utilities ---- + +static void cuda_check(cudaError_t err, const char *where) { + if (err!=cudaSuccess){ + fprintf(stderr,"CUDA error at %s: %s\n",where,cudaGetErrorString(err)); + exit(1); + } +} +#define CUDA_CHECK(x) cuda_check((x), #x) + +static double timestamp(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME,&ts); + return ts.tv_sec+ts.tv_nsec*1e-9; +} + +// load precomputed.bin -> device buffers +// each row = [r(32), k_inv_rDa(32), k_inv(32)] = 96 bytes +static void load_precomputed(const char *path, + uint32_t **d_limb_table, uint8_t **d_r_b64, uint8_t **d_r_tail, + size_t *num_rows_out) +{ + static const char b64c[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + + FILE *f=fopen(path,"rb"); + if (!f){fprintf(stderr,"cannot open %s\n",path);exit(1);} + fseek(f,0,SEEK_END); + long sz=ftell(f);rewind(f); + assert(sz%96==0); + size_t num_rows=sz/96; + uint8_t *raw=(uint8_t*)malloc(sz); + assert(fread(raw,sz,1,f)==1); + fclose(f); + + uint32_t *limb_table=(uint32_t*)malloc(num_rows*20*sizeof(uint32_t)); + uint8_t *r_b64 =(uint8_t *)malloc(num_rows*40); + uint8_t *r_tail =(uint8_t *)malloc(num_rows*2); + + for (size_t i=0;i 40 chars, no remainder) + for (int g=0;g<10;g++){ + uint8_t a=row[g*3],b=row[g*3+1],c=row[g*3+2]; + r_b64[i*40+g*4+0]=b64c[(a>>2)&0x3f]; + r_b64[i*40+g*4+1]=b64c[((a<<4)|(b>>4))&0x3f]; + r_b64[i*40+g*4+2]=b64c[((b<<2)|(c>>6))&0x3f]; + r_b64[i*40+g*4+3]=b64c[c&0x3f]; + } + r_tail[i*2 ]=row[30]; + r_tail[i*2+1]=row[31]; + } + free(raw); + + CUDA_CHECK(cudaMalloc(d_limb_table, num_rows*20*sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(d_r_b64, num_rows*40)); + CUDA_CHECK(cudaMalloc(d_r_tail, num_rows*2)); + CUDA_CHECK(cudaMemcpy(*d_limb_table, limb_table, num_rows*20*sizeof(uint32_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(*d_r_b64, r_b64, num_rows*40, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(*d_r_tail, r_tail, num_rows*2, cudaMemcpyHostToDevice)); + free(limb_table);free(r_b64);free(r_tail); + *num_rows_out=num_rows; +} + +static uint8_t prefix_firstbyte(const char *p) { + static const char *b32c="abcdefghijklmnopqrstuvwxyz234567"; + int c0=(int)(strchr(b32c,p[0])-b32c); + int c1=(int)(strchr(b32c,p[1])-b32c); + return (uint8_t)((c0<<3)|(c1>>2)); +} + +int main(int argc, char *argv[]) { + if (argc<5){ + fprintf(stderr,"USAGE: %s precomputed.bin did:key: [prefix2 ...]\n",argv[0]); + return 1; + } + const char *precomputed_path=argv[2]; + const char *pubkey=argv[3]; + int num_prefixes=argc-4; + char **prefixes=argv+4; + + if (strlen(pubkey)!=57){fprintf(stderr,"invalid pubkey length\n");return 1;} + for (int i=0;i8){fprintf(stderr,"prefix too long (>8)\n");return 1;} + } + if (num_prefixes>MAX_PREFIXES){fprintf(stderr,"too many prefixes (max %d)\n",MAX_PREFIXES);return 1;} + + // ---- build templates ---- + uint8_t presigned_tpl[PRESIGNED_LEN]; + { + char buf[256]; + int n=snprintf(buf,sizeof(buf), + "\xa6""dprev\xf6""dtypemplc_operationhservices\xa0""kalsoKnownAs\x81""kat://AAAAAAlrotationKeys\x81""x9%ssverificationMethods\xa0", + pubkey); + assert(n==PRESIGNED_LEN); + memcpy(presigned_tpl,buf,PRESIGNED_LEN); + } + + uint8_t signed_tpl[SIGNED_LEN]; + { + const uint8_t header[] = "\xa7""csigxV"; // 7 bytes + const uint8_t mid[] = "dprev\xf6""dtypemplc_operationhservices\xa0""kalsoKnownAs\x81""kat://AAAAAAlrotationKeys\x81""x9"; + const uint8_t tail[] = "sverificationMethods\xa0"; // 21 bytes + uint8_t *p=signed_tpl; + memcpy(p,header,7); p+=7; + memset(p,'A',86); p+=86; // sig placeholder + memcpy(p,mid,sizeof(mid)-1); p+=sizeof(mid)-1; + assert((p-signed_tpl)==SIGNED_PUBKEY_OFF); + memcpy(p,pubkey,57); p+=57; + memcpy(p,tail,21); p+=21; + assert((p-signed_tpl)==SIGNED_LEN); + } + + // ---- upload templates to constant memory ---- + CUDA_CHECK(cudaMemcpyToSymbol(d_presigned_tpl, presigned_tpl, PRESIGNED_LEN)); + CUDA_CHECK(cudaMemcpyToSymbol(d_signed_tpl, signed_tpl, SIGNED_LEN)); + + // ---- upload prefix data ---- + { + uint8_t fb[MAX_PREFIXES]={0}; + uint8_t pdata[MAX_PREFIX_DATA]={0}; + uint32_t plens[MAX_PREFIXES]={0}; + uint32_t poffs[MAX_PREFIXES]={0}; + uint32_t off=0; + for (int i=0;i>5]|=(1u<<(fb[i]&31)); + uint32_t np=(uint32_t)num_prefixes; + CUDA_CHECK(cudaMemcpyToSymbol(d_firstbyte_bitmap, bitmap, sizeof(bitmap))); + CUDA_CHECK(cudaMemcpyToSymbol(d_firstbytes, fb, sizeof(uint8_t)*num_prefixes)); + CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_data, pdata, off)); + CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_lens, plens, sizeof(uint32_t)*num_prefixes)); + CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_offsets,poffs, sizeof(uint32_t)*num_prefixes)); + CUDA_CHECK(cudaMemcpyToSymbol(d_num_prefixes, &np, sizeof(uint32_t))); + } + + // ---- load precomputed table ---- + uint32_t *d_limb_table; uint8_t *d_r_b64, *d_r_tail; + size_t num_rows; + load_precomputed(precomputed_path, &d_limb_table, &d_r_b64, &d_r_tail, &num_rows); + fprintf(stderr,"imported %zu rows, WORK_SIZE=%d STEPS_PER_TASK=%d\n",num_rows,WORK_SIZE,STEPS_PER_TASK); + + // ---- allocate result buffers ---- + uint8_t *d_results; + uint32_t *d_result_count; + uint8_t h_results[MAX_RESULTS*RESULT_STRIDE]; + uint32_t h_result_count; + CUDA_CHECK(cudaMalloc(&d_results, MAX_RESULTS*RESULT_STRIDE)); + CUDA_CHECK(cudaMalloc(&d_result_count, sizeof(uint32_t))); + + // ---- mining loop ---- + CUDA_CHECK(cudaFuncSetAttribute(mine_plc, cudaFuncAttributePreferredSharedMemoryCarveout, 0)); + + setlocale(LC_NUMERIC,""); + uint64_t total_plcs=0, total_found=0; + double start=timestamp(); + uint64_t handle_base=0; + int blocks=WORK_SIZE/BLOCK_SIZE; + + while (1) { + for (uint32_t row_base=0; row_base<(uint32_t)num_rows; row_base+=STEPS_PER_TASK) { + + h_result_count=0; + CUDA_CHECK(cudaMemcpy(d_result_count,&h_result_count,sizeof(uint32_t),cudaMemcpyHostToDevice)); + + mine_plc<<>>( + d_limb_table, d_r_b64, d_r_tail, + d_results, d_result_count, + (uint32_t)handle_base, row_base, (uint32_t)num_rows + ); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy(&h_result_count,d_result_count,sizeof(uint32_t),cudaMemcpyDeviceToHost)); + + uint32_t n=h_result_count0){ + CUDA_CHECK(cudaMemcpy(h_results,d_results,n*RESULT_STRIDE,cudaMemcpyDeviceToHost)); + for (uint32_t i=0;i= (1ULL<<32)) { + fprintf(stderr,"\nhandle space exhausted (2^32 handles checked)\n"); + goto done; + } + } + done:; +}