refactor(ai-parser): improve prompts, schemas, and context-aware parsing

+1 -1

packages/ai-parser/package.json

··· 13 13 "dependencies": { 14 14 "@cv/ai-provider": "*", 15 15 "@nestjs/common": "^11.1.3", 16 - "zod": "^3.23.8" 16 + "zod": "^4.3.6" 17 17 }, 18 18 "peerDependencies": { 19 19 "@nestjs/common": "^11.0.0"

+53 -29

packages/ai-parser/src/__tests__/ai-parser.service.spec.ts

··· 9 9 const validParsedCVData: ParsedCVData = { 10 10 personalInfo: { 11 11 name: "John Doe", 12 + headline: undefined, 12 13 introduction: "Experienced software engineer", 14 + city: undefined, 15 + country: undefined, 16 + phone: undefined, 17 + website: undefined, 18 + linkedInUrl: undefined, 13 19 }, 14 20 jobExperiences: [ 15 21 { ··· 29 35 fieldOfStudy: "Computer Science", 30 36 startDate: "2012-09-01", 31 37 endDate: "2016-05-31", 38 + description: undefined, 32 39 skills: ["Algorithms", "Data Structures"], 33 40 }, 34 41 ], ··· 59 66 expect.objectContaining({ 60 67 prompt: expect.stringContaining("Sample CV text here"), 61 68 temperature: 0.1, 62 - maxTokens: 2048, 69 + maxTokens: 8192, 63 70 }), 64 71 ); 65 72 expect(result.personalInfo?.name).toBe("John Doe"); ··· 136 143 ); 137 144 }); 138 145 139 - it("throws error when LLM returns incomplete JSON", async () => { 146 + it("throws error when LLM response is truncated", async () => { 140 147 const mockResponse: AICompletionResponse = { 141 148 content: '{"jobExperiences": [{"companyName": "Tech', 142 149 finishReason: "length", ··· 144 151 mockProvider.complete.mockResolvedValue(mockResponse); 145 152 146 153 await expect(service.parseCVText("Sample CV text")).rejects.toThrow( 147 - "Failed to parse LLM response as JSON", 154 + "LLM response was truncated", 148 155 ); 149 156 }); 150 157 ··· 188 195 189 196 const result = await service.parseCVText("Sample CV text"); 190 197 191 - expect(result.personalInfo).toBeUndefined(); 198 + expect(result.personalInfo).toEqual({}); 192 199 expect(result.jobExperiences).toHaveLength(1); 193 - expect(result.jobExperiences[0].skills).toEqual([]); 200 + expect(result.jobExperiences[0]!.skills).toEqual([]); 194 201 expect(result.education).toEqual([]); 195 202 expect(result.skills).toEqual([]); 196 203 }); ··· 214 221 215 222 const result = await service.parseCVText("Sample CV text"); 216 223 217 - expect(result.jobExperiences[0].endDate).toBeNull(); 224 + expect(result.jobExperiences[0]!.endDate).toBeNull(); 218 225 }); 219 226 220 227 it("trims whitespace from optional string fields", async () => { ··· 241 248 242 249 const result = await service.parseCVText("Sample CV text"); 243 250 244 - expect(result.personalInfo?.name).toBe("John Doe"); 245 - expect(result.personalInfo?.introduction).toBe("Senior developer"); 246 - expect(result.jobExperiences[0].levelName).toBe("Senior"); 247 - expect(result.jobExperiences[0].description).toBe("Led team"); 251 + expect(result.personalInfo.name).toBe("John Doe"); 252 + expect(result.personalInfo.introduction).toBe("Senior developer"); 253 + expect(result.jobExperiences[0]!.levelName).toBe("Senior"); 254 + expect(result.jobExperiences[0]!.description).toBe("Led team"); 248 255 }); 249 256 }); 250 257 ··· 265 272 ); 266 273 }); 267 274 268 - it("uses default maxTokens of 2048", async () => { 275 + it("uses default maxTokens of 8192", async () => { 269 276 const mockResponse: AICompletionResponse = { 270 277 content: JSON.stringify(validParsedCVData), 271 278 finishReason: "stop", ··· 276 283 277 284 expect(mockProvider.complete).toHaveBeenCalledWith( 278 285 expect.objectContaining({ 279 - maxTokens: 2048, 286 + maxTokens: 8192, 280 287 }), 281 288 ); 282 289 }); ··· 319 326 ); 320 327 }); 321 328 322 - it("includes stop sequences in request", async () => { 329 + }); 330 + 331 + describe("prompt generation", () => { 332 + it("includes CV text in the prompt", async () => { 333 + const cvText = "This is my career story with specific details"; 323 334 const mockResponse: AICompletionResponse = { 324 335 content: JSON.stringify(validParsedCVData), 325 336 finishReason: "stop", 326 337 }; 327 338 mockProvider.complete.mockResolvedValue(mockResponse); 328 339 329 - await service.parseCVText("Sample CV"); 340 + await service.parseCVText(cvText); 330 341 331 342 expect(mockProvider.complete).toHaveBeenCalledWith( 332 343 expect.objectContaining({ 333 - stopSequences: ["</s>"], 344 + prompt: expect.stringContaining(cvText), 334 345 }), 335 346 ); 336 347 }); 337 - }); 348 + 349 + it("includes existing user context in prompt when provided", async () => { 350 + const mockResponse: AICompletionResponse = { 351 + content: JSON.stringify(validParsedCVData), 352 + finishReason: "stop", 353 + }; 354 + mockProvider.complete.mockResolvedValue(mockResponse); 355 + 356 + await service.parseCVText("Sample CV", { 357 + name: "Jane Smith", 358 + jobs: [{ company: "ACME", role: "Engineer", startDate: "2020-01-01" }], 359 + skills: ["TypeScript", "React"], 360 + }); 338 361 339 - describe("prompt generation", () => { 340 - it("includes CV text in the prompt", async () => { 341 - const cvText = "This is my career story with specific details"; 362 + const calledWith = mockProvider.complete.mock.calls[0]![0]; 363 + expect(calledWith.prompt).toContain("Jane Smith"); 364 + expect(calledWith.prompt).toContain("ACME"); 365 + expect(calledWith.prompt).toContain("TypeScript, React"); 366 + }); 367 + 368 + it("does not include context block when no context provided", async () => { 342 369 const mockResponse: AICompletionResponse = { 343 370 content: JSON.stringify(validParsedCVData), 344 371 finishReason: "stop", 345 372 }; 346 373 mockProvider.complete.mockResolvedValue(mockResponse); 347 374 348 - await service.parseCVText(cvText); 375 + await service.parseCVText("Sample CV"); 349 376 350 - expect(mockProvider.complete).toHaveBeenCalledWith( 351 - expect.objectContaining({ 352 - prompt: expect.stringContaining(cvText), 353 - }), 354 - ); 377 + const calledWith = mockProvider.complete.mock.calls[0]![0]; 378 + expect(calledWith.prompt).not.toContain("Existing User Data"); 355 379 }); 356 380 357 - it("includes JSON structure example in prompt", async () => { 381 + it("includes JSON structure example in system prompt", async () => { 358 382 const mockResponse: AICompletionResponse = { 359 383 content: JSON.stringify(validParsedCVData), 360 384 finishReason: "stop", ··· 365 389 366 390 expect(mockProvider.complete).toHaveBeenCalledWith( 367 391 expect.objectContaining({ 368 - prompt: expect.stringContaining("jobExperiences"), 392 + systemPrompt: expect.stringContaining("jobExperiences"), 369 393 }), 370 394 ); 371 395 }); 372 396 373 - it("includes parsing instructions in prompt", async () => { 397 + it("includes parsing instructions in system prompt", async () => { 374 398 const mockResponse: AICompletionResponse = { 375 399 content: JSON.stringify(validParsedCVData), 376 400 finishReason: "stop", ··· 381 405 382 406 expect(mockProvider.complete).toHaveBeenCalledWith( 383 407 expect.objectContaining({ 384 - prompt: expect.stringContaining("ISO 8601"), 408 + systemPrompt: expect.stringContaining("ISO 8601"), 385 409 }), 386 410 ); 387 411 });

+2 -30

packages/ai-parser/src/ai-parser.service.test.ts

··· 34 34 const result = await parser.parseCVText('Sample CV text'); 35 35 36 36 expect(result.personalInfo.name).toBe('John Doe'); 37 - expect(mockProvider.complete).toHaveBeenCalledWith( 38 - expect.objectContaining({ 39 - stopSequences: ['</s>'], // Should NOT include '\n\n' 40 - }) 41 - ); 42 37 }); 43 38 44 39 it('should extract raw JSON without markdown', async () => { ··· 57 52 const result = await parser.parseCVText('Sample CV text'); 58 53 59 54 expect(result.personalInfo.name).toBe('Jane Smith'); 60 - }); 61 - }); 62 - 63 - describe('Stop Sequences', () => { 64 - it('should NOT use double newline as stop sequence', async () => { 65 - const mockProvider = createMockProvider(` 66 - { 67 - "personalInfo": {"name": "Test"}, 68 - "jobExperiences": [], 69 - "education": [], 70 - "skills": [] 71 - } 72 - `); 73 - 74 - const parser = new CVParserService(mockProvider); 75 - await parser.parseCVText('Sample CV'); 76 - 77 - // Verify that '\n\n' is NOT in stop sequences 78 - expect(mockProvider.complete).toHaveBeenCalledWith( 79 - expect.objectContaining({ 80 - stopSequences: expect.not.arrayContaining(['\n\n']), 81 - }) 82 - ); 83 55 }); 84 56 }); 85 57 ··· 153 125 154 126 expect(result.personalInfo.name).toBe('John Doe'); 155 127 expect(result.jobExperiences).toHaveLength(1); 156 - expect(result.jobExperiences[0].companyName).toBe('Tech Corp'); 157 - expect(result.jobExperiences[0].endDate).toBeNull(); 128 + expect(result.jobExperiences[0]!.companyName).toBe('Tech Corp'); 129 + expect(result.jobExperiences[0]!.endDate).toBeNull(); 158 130 expect(result.education).toHaveLength(1); 159 131 expect(result.skills).toHaveLength(4); 160 132 });

+20 -7

packages/ai-parser/src/ai-parser.service.ts

··· 1 1 import type { AIProvider } from '@cv/ai-provider'; 2 2 import { ParsedCVDataSchema, type ParsedCVData } from './schemas'; 3 - import { getCV_PARSING_PROMPT } from './prompts'; 3 + import { 4 + CV_SYSTEM_PROMPT, 5 + buildCvUserPrompt, 6 + type ExistingUserContext, 7 + } from './prompts'; 4 8 5 9 /** 6 10 * Configuration for CV parser service ··· 24 28 constructor(provider: AIProvider, config?: CVParserConfig) { 25 29 this.provider = provider; 26 30 this.temperature = config?.temperature ?? 0.1; 27 - this.maxTokens = config?.maxTokens ?? 2048; 31 + this.maxTokens = config?.maxTokens ?? 8192; 28 32 } 29 33 30 34 /** 31 35 * Parse CV text using the AI provider 32 36 * @param cvText Raw text from CV (extracted from PDF, DOCX, etc.) 37 + * @param context Optional existing user data for deduplication and disambiguation 33 38 * @returns Structured CV data matching ParsedCVDataSchema 34 39 */ 35 - async parseCVText(cvText: string): Promise<ParsedCVData> { 40 + async parseCVText( 41 + cvText: string, 42 + context?: ExistingUserContext, 43 + ): Promise<ParsedCVData> { 36 44 if (!cvText || cvText.trim().length === 0) { 37 45 throw new Error('CV text cannot be empty'); 38 46 } 39 47 40 - const prompt = getCV_PARSING_PROMPT(cvText); 41 - 42 48 try { 43 49 const response = await this.provider.complete({ 44 - prompt, 50 + systemPrompt: CV_SYSTEM_PROMPT, 51 + prompt: buildCvUserPrompt(cvText, context), 45 52 temperature: this.temperature, 46 53 maxTokens: this.maxTokens, 47 - stopSequences: ['</s>'], 48 54 }); 55 + 56 + if (response.finishReason === 'length') { 57 + throw new Error( 58 + 'LLM response was truncated (hit max token limit). ' + 59 + `Increase maxTokens (currently ${this.maxTokens}) to allow longer responses.` 60 + ); 61 + } 49 62 50 63 // Extract JSON from response (handle markdown code blocks) 51 64 const rawJson = this.extractJson(response.content);

+28 -4

packages/ai-parser/src/cv-parser.module.ts

··· 1 - import { DynamicModule, Module } from '@nestjs/common'; 2 - import { AI_PROVIDER, AIModule, type AIModuleOptions, type AIProvider } from '@cv/ai-provider'; 3 - import { CVParserService } from './ai-parser.service'; 1 + import { DynamicModule, Module } from "@nestjs/common"; 2 + import { 3 + AI_PROVIDER, 4 + AIModule, 5 + type AIModuleOptions, 6 + type AIProvider, 7 + } from "@cv/ai-provider"; 8 + import { CVParserService } from "./ai-parser.service"; 4 9 5 - export const CV_PARSER_SERVICE = Symbol('CV_PARSER_SERVICE'); 10 + export const CV_PARSER_SERVICE = Symbol("CV_PARSER_SERVICE"); 6 11 7 12 @Module({}) 8 13 export class CVParserModule { ··· 10 15 return { 11 16 module: CVParserModule, 12 17 imports: [AIModule.forRoot(aiOptions)], 18 + providers: [ 19 + { 20 + provide: CV_PARSER_SERVICE, 21 + inject: [AI_PROVIDER], 22 + useFactory: (aiProvider: AIProvider): CVParserService => 23 + new CVParserService(aiProvider), 24 + }, 25 + ], 26 + exports: [CV_PARSER_SERVICE], 27 + }; 28 + } 29 + 30 + /** 31 + * Resolve the AI provider type from ConfigService at factory time. 32 + */ 33 + static forConfig(): DynamicModule { 34 + return { 35 + module: CVParserModule, 36 + imports: [AIModule.forConfig()], 13 37 providers: [ 14 38 { 15 39 provide: CV_PARSER_SERVICE,

+8 -1

packages/ai-parser/src/index.ts

··· 9 9 } from './schemas'; 10 10 11 11 // Prompts 12 - export { CV_PARSING_PROMPT, getCV_PARSING_PROMPT } from './prompts'; 12 + export { 13 + CV_SYSTEM_PROMPT, 14 + buildCvUserPrompt, 15 + buildContextBlock, 16 + CV_PARSING_PROMPT, 17 + getCV_PARSING_PROMPT, 18 + type ExistingUserContext, 19 + } from './prompts'; 13 20 14 21 // Service 15 22 export { CVParserService, type CVParserConfig } from './ai-parser.service';

+104 -13

packages/ai-parser/src/prompts.ts

··· 1 1 /** 2 - * System prompt for CV parsing 3 - * Instructs the LLM to extract structured information from CV text 2 + * Existing user data to provide as context to the AI parser. 3 + * Helps with deduplication, gap-filling, and disambiguation. 4 + */ 5 + export interface ExistingUserContext { 6 + name?: string; 7 + headline?: string; 8 + city?: string; 9 + country?: string; 10 + jobs?: Array<{ 11 + company: string; 12 + role: string; 13 + startDate: string; 14 + endDate?: string; 15 + }>; 16 + education?: Array<{ 17 + institution: string; 18 + degree: string; 19 + startDate: string; 20 + endDate?: string; 21 + }>; 22 + skills?: string[]; 23 + } 24 + 25 + /** 26 + * Format a date range for display in the context block. 27 + */ 28 + const formatRange = (start: string, end?: string): string => 29 + `${start} - ${end ?? "present"}`; 30 + 31 + /** 32 + * Build a prompt section with existing user data for context. 33 + * Returns empty string when context has no meaningful data. 34 + */ 35 + export const buildContextBlock = (context: ExistingUserContext): string => { 36 + const lines: string[] = []; 37 + 38 + if (context.name) lines.push(`Name: ${context.name}`); 39 + if (context.headline) lines.push(`Headline: ${context.headline}`); 40 + if (context.city || context.country) { 41 + lines.push( 42 + `Location: ${[context.city, context.country].filter(Boolean).join(", ")}`, 43 + ); 44 + } 45 + 46 + if (context.jobs?.length) { 47 + const jobList = context.jobs 48 + .map((j) => ` - ${j.role} at ${j.company} (${formatRange(j.startDate, j.endDate)})`) 49 + .join("\n"); 50 + lines.push(`Current jobs:\n${jobList}`); 51 + } 52 + 53 + if (context.education?.length) { 54 + const eduList = context.education 55 + .map((e) => ` - ${e.degree} at ${e.institution} (${formatRange(e.startDate, e.endDate)})`) 56 + .join("\n"); 57 + lines.push(`Education:\n${eduList}`); 58 + } 59 + 60 + if (context.skills?.length) { 61 + lines.push(`Known skills: ${context.skills.join(", ")}`); 62 + } 63 + 64 + return lines.length === 0 65 + ? "" 66 + : `\n## Existing User Data (for reference — merge, don't duplicate)\n${lines.join("\n")}`; 67 + }; 68 + 69 + /** 70 + * System prompt for CV parsing. 71 + * Instructs the LLM to extract structured information from CV text. 4 72 */ 5 - export const CV_PARSING_PROMPT = `You are a professional CV parser. Your task is to extract structured information from the provided CV text and return it as a JSON object. 73 + export const CV_SYSTEM_PROMPT = `You are a professional CV parser. Your task is to extract structured information from the provided CV text and return it as a JSON object. 6 74 7 75 Extract the following information: 8 - 1. Personal info: name and introduction/summary 76 + 1. Personal info: name, headline/title, introduction/summary, city, country, phone, website, and LinkedIn URL 9 77 2. Work experience: for each job, extract company, role, level, dates, description, and skills mentioned 10 78 3. Education: for each entry, extract institution, degree, field of study, dates, description, and skills 11 79 4. Skills: list of all mentioned skills ··· 15 83 - All dates must be in ISO 8601 format (YYYY-MM-DD) 16 84 - For current positions, set endDate to null 17 85 - Skills should be extracted as an array of strings 18 - - If a field is not found, omit it from the object (except for arrays, which default to []) 19 86 - Company/institution names should be exact as written in the CV 20 87 - Keep descriptions concise (1-2 sentences) 88 + - The personalInfo object is REQUIRED — always include it, even if only the name is available. Extract whatever you can find; omit individual fields only if truly absent from the CV. 89 + - For personalInfo: headline is the professional title (e.g. "Senior Software Engineer"), introduction is a summary paragraph 90 + - If existing user data is provided, use it to disambiguate ambiguous entries (e.g. a teaching position at a university could be work or education — check existing data for clues) 91 + - If the CV contains entries that match existing user data, update/enrich them rather than creating duplicates 21 92 22 93 Example JSON structure: 23 94 { 24 95 "personalInfo": { 25 96 "name": "John Doe", 26 - "introduction": "Software engineer with 10 years of experience" 97 + "headline": "Senior Software Engineer", 98 + "introduction": "Software engineer with 10 years of experience", 99 + "city": "Amsterdam", 100 + "country": "Netherlands", 101 + "phone": "+31 6 1234 5678", 102 + "website": "https://johndoe.dev", 103 + "linkedInUrl": "https://linkedin.com/in/johndoe" 27 104 }, 28 105 "jobExperiences": [ 29 106 { ··· 49 126 "skills": ["Kubernetes", "Go", "Docker", "PostgreSQL", "C++", "Algorithms"] 50 127 } 51 128 52 - CV Text to parse: 53 - --- 54 - {cvText} 55 - --- 56 - 57 129 Return only the JSON object.`; 58 130 59 - export const getCV_PARSING_PROMPT = (cvText: string): string => { 60 - return CV_PARSING_PROMPT.replace('{cvText}', cvText); 131 + /** 132 + * Build the user prompt containing the CV text to parse. 133 + * Optionally includes existing user context for deduplication. 134 + */ 135 + export const buildCvUserPrompt = ( 136 + cvText: string, 137 + context?: ExistingUserContext, 138 + ): string => { 139 + const contextBlock = context ? buildContextBlock(context) : ""; 140 + return `${contextBlock}\n\nParse the following CV text:\n---\n${cvText}\n---`; 61 141 }; 142 + 143 + /** 144 + * @deprecated Use CV_SYSTEM_PROMPT + buildCvUserPrompt separately 145 + */ 146 + export const CV_PARSING_PROMPT = `${CV_SYSTEM_PROMPT}\n\nCV Text to parse:\n---\n{cvText}\n---\n\nReturn only the JSON object.`; 147 + 148 + /** 149 + * @deprecated Use CV_SYSTEM_PROMPT + buildCvUserPrompt separately 150 + */ 151 + export const getCV_PARSING_PROMPT = (cvText: string): string => 152 + CV_PARSING_PROMPT.replace("{cvText}", cvText);

+49 -11

packages/ai-parser/src/schemas.ts

··· 1 - import { z } from 'zod'; 1 + import { z } from "zod/v4"; 2 2 3 3 /** 4 4 * Schema for parsed job experience extracted from CV text ··· 8 8 roleName: z.string().min(1, 'Role name is required'), 9 9 levelName: z 10 10 .string() 11 - .optional() 11 + .nullish() 12 12 .transform((val) => val?.trim() || undefined), 13 13 startDate: z.string().min(1, 'Start date is required'), // ISO date string YYYY-MM-DD 14 14 endDate: z.string().nullable().optional(), // ISO date string or null for current position 15 15 description: z 16 16 .string() 17 - .optional() 17 + .nullish() 18 18 .transform((val) => val?.trim() || undefined), 19 19 skills: z.array(z.string()).default([]), 20 20 }); ··· 26 26 */ 27 27 export const ParsedEducationSchema = z.object({ 28 28 institutionName: z.string().min(1, 'Institution name is required'), 29 - degree: z.string().min(1, 'Degree is required'), 29 + degree: z 30 + .string() 31 + .nullish() 32 + .transform((val) => val?.trim() || "Unknown"), 30 33 fieldOfStudy: z 31 - .string() 32 - .optional() 33 - .transform((val) => val?.trim() || undefined), 34 + .union([z.string(), z.array(z.string())]) 35 + .nullish() 36 + .transform((val) => 37 + Array.isArray(val) ? val.join(", ") : val?.trim() || undefined, 38 + ), 34 39 startDate: z.string().min(1, 'Start date is required'), // ISO date string YYYY-MM-DD 35 40 endDate: z.string().nullable().optional(), // ISO date string or null for currently studying 36 41 description: z 37 42 .string() 38 - .optional() 43 + .nullish() 39 44 .transform((val) => val?.trim() || undefined), 40 45 skills: z.array(z.string()).default([]), 41 46 }); ··· 50 55 .object({ 51 56 name: z 52 57 .string() 53 - .optional() 58 + .nullish() 59 + .transform((val) => val?.trim() || undefined), 60 + headline: z 61 + .string() 62 + .nullish() 54 63 .transform((val) => val?.trim() || undefined), 55 64 introduction: z 56 65 .string() 57 - .optional() 66 + .nullish() 67 + .transform((val) => val?.trim() || undefined), 68 + city: z 69 + .string() 70 + .nullish() 71 + .transform((val) => val?.trim() || undefined), 72 + country: z 73 + .string() 74 + .nullish() 75 + .transform((val) => val?.trim() || undefined), 76 + phone: z 77 + .string() 78 + .nullish() 79 + .transform((val) => val?.trim() || undefined), 80 + website: z 81 + .string() 82 + .nullish() 83 + .transform((val) => val?.trim() || undefined), 84 + linkedInUrl: z 85 + .string() 86 + .nullish() 58 87 .transform((val) => val?.trim() || undefined), 59 88 }) 60 - .optional(), 89 + .default({ 90 + name: undefined, 91 + headline: undefined, 92 + introduction: undefined, 93 + city: undefined, 94 + country: undefined, 95 + phone: undefined, 96 + website: undefined, 97 + linkedInUrl: undefined, 98 + }), 61 99 jobExperiences: z.array(ParsedJobExperienceSchema).default([]), 62 100 education: z.array(ParsedEducationSchema).default([]), 63 101 skills: z.array(z.string()).default([]),

Configure Feed

Configure Feed