AppView in a box as a Vite plugin thing · hatk.dev

perf: chunk bulk inserts during backfill and bump heap to 512MB

Inserts records in batches of 1000 instead of collecting all records
for a repo before inserting. Moves the DELETE pass before the insert
loop: chunks are now flushed during iteration, so a DELETE that ran
afterwards would wipe rows already inserted. Bumps max-old-space-size
to 512.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
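
For reference, the shape of the change as a minimal standalone sketch (TypeScript; Rec, importAll, and insertBatch are illustrative names, not the actual hatk API): flush every chunkSize records and once more after the loop, so peak memory is bounded by one chunk rather than the whole repo.

// Minimal sketch of the chunked-flush pattern this commit adopts.
// `Rec` and `insertBatch` are illustrative placeholders, not the actual hatk API.
type Rec = { uri: string; cid: string; did: string; record: unknown }

async function importAll(
  entries: Iterable<Rec>,
  insertBatch: (rows: Rec[]) => Promise<number>,
  chunkSize = 1000,
): Promise<number> {
  let count = 0
  let chunk: Rec[] = []
  for (const entry of entries) {
    chunk.push(entry)
    // Flush as soon as a chunk fills; peak memory is bounded by one
    // chunk instead of every record in the repo.
    if (chunk.length >= chunkSize) {
      count += await insertBatch(chunk)
      chunk = []
    }
  }
  // Flush the final partial chunk.
  if (chunk.length > 0) count += await insertBatch(chunk)
  return count
}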

+32 -21
+3
package.json
··· 12 12 "format": "oxfmt --write .", 13 13 "format:check": "oxfmt --check .", 14 14 "check": "tsc --noEmit && oxlint . && oxfmt --check .", 15 + "build": "npm run build -w @hatk/hatk", 16 + "publish:hatk": "npm publish -w @hatk/hatk --tag alpha", 17 + "release": "npm run build -w @hatk/hatk && npm publish -w @hatk/hatk --tag alpha", 15 18 "docs": "npm run dev -w @hatk/docs", 16 19 "docs:build": "npm run build -w @hatk/docs" 17 20 },
+1 -1
packages/hatk/package.json
···
  {
    "name": "@hatk/hatk",
-   "version": "0.0.1-alpha.6",
+   "version": "0.0.1-alpha.8",
    "license": "MIT",
    "bin": {
      "hatk": "dist/cli.js"
+27 -19
packages/hatk/src/backfill.ts
···
      // Walk MST to find all record paths
      const entries = walkMst(blocks, commit.data.$link)

-     const bulk: BulkRecord[] = []
+     // Delete existing records for this DID before re-importing so deletions are reflected
+     for (const col of collections) {
+       const schema = getSchema(col)
+       if (!schema) continue
+       await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did)
+       for (const child of schema.children) {
+         await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did)
+       }
+       for (const union of schema.unions) {
+         for (const branch of union.branches) {
+           await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did)
+         }
+       }
+     }
+
+     // Insert records in chunks to limit memory usage
+     const CHUNK_SIZE = 1000
+     let chunk: BulkRecord[] = []
      for (const entry of entries) {
        const collection = entry.path.split('/')[0]
        if (!collections.has(collection)) continue
···
          const rkey = entry.path.split('/').slice(1).join('/')
          const uri = `at://${did}/${collection}/${rkey}`
-         bulk.push({ collection, uri, cid: entry.cid, did, record })
+         chunk.push({ collection, uri, cid: entry.cid, did, record })
+
+         if (chunk.length >= CHUNK_SIZE) {
+           count += await bulkInsertRecords(chunk)
+           chunk = []
+         }
        } catch (recordErr: any) {
          emit('backfill', 'record_error', {
            did,
···
          })
        }
      }
-     blocks = null // free block map before bulk insert
-
-     // Delete existing records for this DID before re-importing so deletions are reflected
-     for (const col of collections) {
-       const schema = getSchema(col)
-       if (!schema) continue
-       await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did)
-       for (const child of schema.children) {
-         await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did)
-       }
-       for (const union of schema.unions) {
-         for (const branch of union.branches) {
-           await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did)
-         }
-       }
+     blocks = null // free block map
+     if (chunk.length > 0) {
+       count += await bulkInsertRecords(chunk)
      }
-
-     count = await bulkInsertRecords(bulk)
      await setRepoStatus(did, 'active', commit.rev, { handle })
      return count
    } catch (err: any) {
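
Why the DELETE pass moved: with a single bulk insert at the end, deleting the DID's existing rows at any earlier point was safe. Once chunks are flushed mid-loop, a delete that runs after the loop would wipe rows the earlier chunks had already written, so it now runs exactly once, up front, before the first chunk lands.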
+1 -1
packages/hatk/src/cli.ts
···
  RUN node_modules/.bin/hatk build
  RUN npm prune --omit=dev
  EXPOSE 3000
- CMD ["node", "--max-old-space-size=256", "node_modules/@hatk/hatk/dist/main.js", "config.yaml"]
+ CMD ["node", "--max-old-space-size=512", "node_modules/@hatk/hatk/dist/main.js", "config.yaml"]
  `,
  )
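
For reference, --max-old-space-size caps V8's old-generation heap in megabytes, and a Node process that outgrows it is killed with an out-of-memory abort. Chunking lowers backfill's steady-state footprint; the bump from 256 to 512 presumably adds headroom for large repos on top of that.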