From 0ad656ce35510a98f9acd08358c5e1fb1b1925cf Mon Sep 17 00:00:00 2001 From: arifal Date: Sat, 23 Aug 2025 03:25:15 +0700 Subject: [PATCH] try fix similaruuty and add seed for master excel icd --- .env.example | 4 +- docker-compose.yml | 7 +- docs/ENVIRONMENT_VARIABLES.md | 73 ++++ docs/SIMILARITY_THRESHOLD.md | 243 +++++++++++ package.json | 6 +- .../seed/icd}/[PUBLIC] ICD-10 e-klaim.xlsx | Bin .../seed/icd}/[PUBLIC] ICD-9CM e-klaim.xlsx | Bin prisma/seed/icd/icd.seed.ts | 170 ++++++++ prisma/seed/seed.ts | 27 ++ src/health/health.controller.ts | 4 +- src/icd/icd.controller.ts | 38 +- src/icd/icd.service.ts | 149 +------ src/icd/pgvector.controller.ts | 369 +++++++++++++++- src/icd/pgvector.service.ts | 407 ++++++++++++++++-- 14 files changed, 1274 insertions(+), 223 deletions(-) create mode 100644 docs/ENVIRONMENT_VARIABLES.md create mode 100644 docs/SIMILARITY_THRESHOLD.md rename {test => prisma/seed/icd}/[PUBLIC] ICD-10 e-klaim.xlsx (100%) rename {test => prisma/seed/icd}/[PUBLIC] ICD-9CM e-klaim.xlsx (100%) create mode 100644 prisma/seed/icd/icd.seed.ts create mode 100644 prisma/seed/seed.ts diff --git a/.env.example b/.env.example index 253338c..fd674bd 100644 --- a/.env.example +++ b/.env.example @@ -176,4 +176,6 @@ METRICS_PATH=/metrics HEALTH_CHECK_PATH=/health OPENAI_API_KEY=xxxxxx -OPENAI_API_MODEL=text-embedding-ada-002 \ No newline at end of file +OPENAI_API_MODEL=text-embedding-ada-002 + +VECTOR_SIMILARITY_THRESHOLD=0.85 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 4f2b6db..ca68c81 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,8 @@ +version: '3.8' + services: - # PostgreSQL Database with pgvector extension postgres: - image: pgvector/pgvector:pg15 + image: pgvector/pgvector:pg17 container_name: claim-guard-postgres restart: unless-stopped environment: @@ -9,7 +10,7 @@ services: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres123 ports: - - '5432:5432' + - '5433:5432' # host:container → akses dari host pakai port 5433 volumes: - postgres_data:/var/lib/postgresql/data - ./docker/postgres/init:/docker-entrypoint-initdb.d diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md new file mode 100644 index 0000000..6a77544 --- /dev/null +++ b/docs/ENVIRONMENT_VARIABLES.md @@ -0,0 +1,73 @@ +# Environment Variables + +## Database Configuration +- `DATABASE_URL`: PostgreSQL connection string + - Example: `postgresql://username:password@localhost:5432/claim_guard_db` + +## OpenAI Configuration +- `OPENAI_API_KEY`: Your OpenAI API key for embeddings +- `OPENAI_API_MODEL`: OpenAI model for embeddings (default: `text-embedding-ada-002`) + +## Vector Search Configuration +- `VECTOR_SIMILARITY_THRESHOLD`: Minimum similarity threshold for vector search (default: `0.85`) + - Range: 0.0 to 1.0 + - Higher values = more strict matching + - Recommended: 0.85 for production, 0.7 for development + +## Application Configuration +- `PORT`: Application port (default: 3000) +- `NODE_ENV`: Environment mode (development/production) + +## Example .env file +```bash +# Database +DATABASE_URL="postgresql://username:password@localhost:5432/claim_guard_db" + +# OpenAI +OPENAI_API_KEY="your-openai-api-key-here" +OPENAI_API_MODEL="text-embedding-ada-002" + +# Vector Search +VECTOR_SIMILARITY_THRESHOLD=0.85 + +# App +PORT=3000 +NODE_ENV=development +``` + +## Similarity Threshold Guidelines + +### Production Environment +- **High Precision**: 0.90 - 0.95 (very strict matching) +- **Standard**: 0.85 - 0.90 (recommended for most use cases) +- **Balanced**: 0.80 - 0.85 (good balance between precision and recall) + +### Development Environment +- **Testing**: 0.70 - 0.80 (more lenient for testing) +- **Debugging**: 0.60 - 0.70 (very lenient for development) + +### How to Set Threshold + +#### Via Environment Variable +```bash +export VECTOR_SIMILARITY_THRESHOLD=0.90 +``` + +#### Via .env file +```bash +VECTOR_SIMILARITY_THRESHOLD=0.90 +``` + +#### Via API (Runtime) +```bash +POST /api/pgvector/threshold +{ + "threshold": 0.90 +} +``` + +## Impact of Threshold Changes + +- **Higher Threshold (0.90+)**: Fewer results, higher precision, more relevant matches +- **Lower Threshold (0.70-)**: More results, lower precision, may include less relevant matches +- **Optimal Range (0.80-0.90)**: Good balance between precision and recall for most medical coding use cases diff --git a/docs/SIMILARITY_THRESHOLD.md b/docs/SIMILARITY_THRESHOLD.md new file mode 100644 index 0000000..40303c7 --- /dev/null +++ b/docs/SIMILARITY_THRESHOLD.md @@ -0,0 +1,243 @@ +# Similarity Threshold Configuration + +## Overview + +The similarity threshold feature allows you to control the precision of vector search results by setting a minimum similarity score required for results to be returned. This ensures that only highly relevant matches are included in search results. + +## Default Configuration + +- **Default Threshold**: `0.85` (85% similarity) +- **Environment Variable**: `VECTOR_SIMILARITY_THRESHOLD` +- **Range**: 0.0 to 1.0 (0% to 100% similarity) + +## API Endpoints + +### 1. Get Current Threshold + +```http +GET /api/pgvector/threshold +``` + +**Response:** + +```json +{ + "threshold": 0.85, + "description": "Minimum similarity score required for search results (0.0 - 1.0)" +} +``` + +### 2. Set Threshold + +```http +POST /api/pgvector/threshold +Content-Type: application/json + +{ + "threshold": 0.90 +} +``` + +**Response:** + +```json +{ + "message": "Similarity threshold updated successfully", + "threshold": 0.9, + "previousThreshold": 0.85 +} +``` + +### 3. Advanced Vector Search + +```http +POST /api/pgvector/advanced-search +Content-Type: application/json + +{ + "query": "diabetes mellitus type 2", + "limit": 10, + "category": "ICD10", + "threshold": 0.90 +} +``` + +## Search Methods + +### Standard Vector Search + +- Uses cosine similarity +- Default threshold from environment variable +- Good for general use cases + +### Advanced Vector Search + +- Combines cosine and euclidean similarity metrics +- Weighted scoring: 70% cosine + 30% euclidean +- Higher precision results +- Recommended for production use + +### Hybrid Search + +- Combines vector similarity with text search +- Uses threshold from environment variable +- Best balance of semantic and text matching + +## Threshold Recommendations + +### Medical Coding Use Cases + +| Use Case | Recommended Threshold | Description | +| ---------------------------- | --------------------- | --------------------------------------------- | +| **High Precision Diagnosis** | 0.90 - 0.95 | Very strict matching for critical diagnoses | +| **Standard Medical Coding** | 0.85 - 0.90 | Recommended for most medical coding scenarios | +| **General Medical Search** | 0.80 - 0.85 | Good balance between precision and recall | +| **Research & Exploration** | 0.70 - 0.80 | More lenient for research purposes | + +### Environment-Specific Settings + +#### Production Environment + +```bash +VECTOR_SIMILARITY_THRESHOLD=0.85 +``` + +#### Development Environment + +```bash +VECTOR_SIMILARITY_THRESHOLD=0.70 +``` + +#### Testing Environment + +```bash +VECTOR_SIMILARITY_THRESHOLD=0.75 +``` + +## Implementation Details + +### Environment Variable + +```bash +# Set in .env file +VECTOR_SIMILARITY_THRESHOLD=0.85 + +# Or set as system environment variable +export VECTOR_SIMILARITY_THRESHOLD=0.85 +``` + +### Runtime Configuration + +```typescript +// Get current threshold +const currentThreshold = pgVectorService.getSimilarityThreshold(); + +// Set new threshold +pgVectorService.setSimilarityThreshold(0.9); +``` + +### SQL Query Optimization + +The system automatically optimizes SQL queries to: + +- Filter results at database level using threshold +- Order results by similarity score +- Use appropriate vector similarity operators + +## Performance Impact + +### Higher Threshold (0.90+) + +- ✅ Fewer results to process +- ✅ Higher precision +- ❌ May miss relevant results +- ❌ Slower query execution (more filtering) + +### Lower Threshold (0.70-) + +- ✅ Faster query execution +- ✅ More comprehensive results +- ❌ Lower precision +- ❌ More irrelevant results + +### Optimal Range (0.80-0.90) + +- ✅ Good balance of precision and performance +- ✅ Suitable for most medical coding scenarios +- ✅ Reasonable query execution time + +## Troubleshooting + +### Common Issues + +1. **No Results Returned** + - Check if threshold is too high + - Verify embeddings are generated + - Check database connection + +2. **Too Many Results** + - Increase threshold value + - Use advanced search method + - Add category filters + +3. **Performance Issues** + - Optimize threshold for your use case + - Use database indexes + - Consider batch processing + +### Debug Commands + +```bash +# Check current threshold +curl -X GET http://localhost:3000/api/pgvector/threshold + +# Get embedding statistics +curl -X GET http://localhost:3000/api/pgvector/stats + +# Test with different thresholds +curl -X POST http://localhost:3000/api/pgvector/advanced-search \ + -H "Content-Type: application/json" \ + -d '{"query": "test", "threshold": 0.80}' +``` + +## Best Practices + +1. **Start with Default**: Begin with threshold 0.85 +2. **Test Incrementally**: Adjust threshold in small increments (0.05) +3. **Monitor Results**: Evaluate precision vs. recall trade-offs +4. **Environment Specific**: Use different thresholds for different environments +5. **Document Changes**: Keep track of threshold changes and their impact + +## Migration Guide + +### From Previous Version + +If upgrading from a version without configurable threshold: + +1. **Set Environment Variable**: + + ```bash + VECTOR_SIMILARITY_THRESHOLD=0.85 + ``` + +2. **Update Search Calls**: + + ```typescript + // Old way (hardcoded 0.7) + const results = await service.vectorSearch(query, limit, category, 0.7); + + // New way (uses environment variable) + const results = await service.vectorSearch(query, limit, category); + ``` + +3. **Test New Thresholds**: + + ```bash + # Test with current threshold + curl -X GET http://localhost:3000/api/pgvector/threshold + + # Adjust if needed + curl -X POST http://localhost:3000/api/pgvector/threshold \ + -H "Content-Type: application/json" \ + -d '{"threshold": 0.90}' + ``` diff --git a/package.json b/package.json index cc1871e..ab737f1 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "license": "UNLICENSED", "scripts": { "build": "nest build", - "format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\"", + "format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\" \"prisma/seed/**/*.ts\"", "start": "nest start", "start:dev": "nest start --watch", "start:debug": "nest start --debug --watch", @@ -17,7 +17,9 @@ "test:watch": "jest --watch", "test:cov": "jest --coverage", "test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand", - "test:e2e": "jest --config ./test/jest-e2e.json" + "test:e2e": "jest --config ./test/jest-e2e.json", + "seed": "ts-node -r tsconfig-paths/register prisma/seed/seed.ts", + "seed:icd": "ts-node -r tsconfig-paths/register prisma/seed/icd/icd.seed.ts" }, "dependencies": { "@langchain/community": "^0.3.53", diff --git a/test/[PUBLIC] ICD-10 e-klaim.xlsx b/prisma/seed/icd/[PUBLIC] ICD-10 e-klaim.xlsx similarity index 100% rename from test/[PUBLIC] ICD-10 e-klaim.xlsx rename to prisma/seed/icd/[PUBLIC] ICD-10 e-klaim.xlsx diff --git a/test/[PUBLIC] ICD-9CM e-klaim.xlsx b/prisma/seed/icd/[PUBLIC] ICD-9CM e-klaim.xlsx similarity index 100% rename from test/[PUBLIC] ICD-9CM e-klaim.xlsx rename to prisma/seed/icd/[PUBLIC] ICD-9CM e-klaim.xlsx diff --git a/prisma/seed/icd/icd.seed.ts b/prisma/seed/icd/icd.seed.ts new file mode 100644 index 0000000..0101bc5 --- /dev/null +++ b/prisma/seed/icd/icd.seed.ts @@ -0,0 +1,170 @@ +import { PrismaClient } from '@prisma/client'; +import * as XLSX from 'xlsx'; +import * as path from 'path'; +import * as fs from 'fs'; + +interface IcdData { + code: string; + display: string; + version: string; +} + +export class IcdSeeder { + private readonly prisma = new PrismaClient(); + + async seed(): Promise<{ + icd9Count: number; + icd10Count: number; + total: number; + }> { + try { + console.log('Starting ICD data import...'); + + // Import ICD-9 data + const icd9Data = this.readExcelFile( + 'prisma/seed/icd/[PUBLIC] ICD-9CM e-klaim.xlsx', + 'ICD9', + ); + + // Import ICD-10 data + const icd10Data = this.readExcelFile( + 'prisma/seed/icd/[PUBLIC] ICD-10 e-klaim.xlsx', + 'ICD10', + ); + + // Clear existing data + await this.prisma.icdCode.deleteMany({}); + console.log('Cleared existing ICD data'); + + // Insert ICD-9 data + const icd9Count = await this.bulkInsertData(icd9Data, 'ICD9'); + console.log(`Imported ${icd9Count} ICD-9 codes`); + + // Insert ICD-10 data + const icd10Count = await this.bulkInsertData(icd10Data, 'ICD10'); + console.log(`Imported ${icd10Count} ICD-10 codes`); + + const total = icd9Count + icd10Count; + console.log(`Total imported: ${total} ICD codes`); + + return { + icd9Count, + icd10Count, + total, + }; + } catch (error) { + console.error('Error importing ICD data:', error); + throw error; + } + } + + private readExcelFile(filePath: string, category: string): IcdData[] { + try { + const fullPath = path.join(process.cwd(), filePath); + + if (!fs.existsSync(fullPath)) { + throw new Error(`File not found: ${fullPath}`); + } + + console.log(`Reading ${category} file: ${filePath}`); + + const workbook = XLSX.readFile(fullPath); + const sheetName = workbook.SheetNames[0]; + const worksheet = workbook.Sheets[sheetName]; + + // Convert sheet to JSON + const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 }); + + // Skip header row and process data + const icdData: IcdData[] = []; + + for (let i = 1; i < jsonData.length; i++) { + const row = jsonData[i] as any[]; + + if (row && row.length >= 3) { + const code = this.cleanString(row[0]); + const display = this.cleanString(row[1]); + const version = this.cleanString(row[2]); + + if (code && display && version) { + icdData.push({ + code, + display, + version, + }); + } + } + } + + console.log(`Found ${icdData.length} valid ${category} records`); + return icdData; + } catch (error) { + console.error(`Error reading ${category} file:`, error); + throw error; + } + } + + private async bulkInsertData( + data: IcdData[], + category: string, + ): Promise { + try { + const batchSize = 1000; + let totalInserted = 0; + + for (let i = 0; i < data.length; i += batchSize) { + const batch = data.slice(i, i + batchSize); + + const insertData = batch.map((item) => ({ + code: item.code, + display: item.display, + version: item.version, + category, + })); + + await this.prisma.icdCode.createMany({ + data: insertData, + skipDuplicates: true, + }); + + totalInserted += batch.length; + console.log( + `Inserted batch ${Math.floor(i / batchSize) + 1} for ${category}: ${batch.length} records`, + ); + } + + return totalInserted; + } catch (error) { + console.error(`Error inserting ${category} data:`, error); + throw error; + } + } + + private cleanString(value: any): string { + if (value === null || value === undefined) { + return ''; + } + return String(value).trim(); + } + + async disconnect() { + await this.prisma.$disconnect(); + } +} + +// Standalone execution +if (require.main === module) { + const seeder = new IcdSeeder(); + seeder + .seed() + .then((result) => { + console.log('ICD seeding completed successfully:', result); + }) + .catch((error) => { + console.error('ICD seeding failed:', error); + process.exit(1); + }) + .finally(() => { + void seeder.disconnect(); + }); +} diff --git a/prisma/seed/seed.ts b/prisma/seed/seed.ts new file mode 100644 index 0000000..41dae1c --- /dev/null +++ b/prisma/seed/seed.ts @@ -0,0 +1,27 @@ +import { PrismaClient } from '@prisma/client'; +import { IcdSeeder } from './icd/icd.seed'; + +const prisma = new PrismaClient(); + +async function main() { + console.log('Starting database seeding...'); + + try { + // Seed ICD data + const icdSeeder = new IcdSeeder(); + const icdResult = await icdSeeder.seed(); + console.log('ICD seeding completed:', icdResult); + await icdSeeder.disconnect(); + + console.log('Database seeding completed successfully!'); + } catch (error) { + console.error('Error during seeding:', error); + throw error; + } +} + +main() + .catch(console.error) + .finally(() => { + void prisma.$disconnect(); + }); diff --git a/src/health/health.controller.ts b/src/health/health.controller.ts index 14152a7..cacbce9 100644 --- a/src/health/health.controller.ts +++ b/src/health/health.controller.ts @@ -64,7 +64,7 @@ export class HealthController { status: 200, description: 'Application is ready', }) - async getReady() { + getReady() { return { status: 'ready' }; } @@ -77,7 +77,7 @@ export class HealthController { status: 200, description: 'Application is alive', }) - async getLive() { + getLive() { return { status: 'alive' }; } } diff --git a/src/icd/icd.controller.ts b/src/icd/icd.controller.ts index 24fada4..d0e0e8d 100644 --- a/src/icd/icd.controller.ts +++ b/src/icd/icd.controller.ts @@ -1,4 +1,4 @@ -import { Controller, Get, Post, Query, Logger } from '@nestjs/common'; +import { Controller, Get, Query, Logger } from '@nestjs/common'; import { ApiTags, ApiOperation, @@ -8,10 +8,8 @@ import { ApiInternalServerErrorResponse, } from '@nestjs/swagger'; import { IcdService } from './icd.service'; -import { SearchIcdDto } from './dto/search-icd.dto'; import { IcdSearchResponseDto, - IcdImportResponseDto, IcdStatisticsResponseDto, ErrorResponseDto, } from './dto/icd-response.dto'; @@ -23,40 +21,6 @@ export class IcdController { constructor(private readonly icdService: IcdService) {} - @Post('import') - @ApiOperation({ - summary: 'Import ICD data from Excel files', - description: - 'Import ICD-9 and ICD-10 codes from Excel files located in the test directory. This operation will process both ICD files and insert/update the database with the latest codes.', - }) - @ApiResponse({ - status: 200, - description: 'ICD data imported successfully', - type: IcdImportResponseDto, - }) - @ApiBadRequestResponse({ - description: 'Bad request - Invalid file format or missing files', - type: ErrorResponseDto, - }) - @ApiInternalServerErrorResponse({ - description: 'Internal server error during import process', - type: ErrorResponseDto, - }) - async importData(): Promise { - try { - this.logger.log('Starting ICD data import...'); - const result = await this.icdService.importIcdData(); - return { - success: true, - message: 'ICD data imported successfully', - data: result, - }; - } catch (error) { - this.logger.error('Error importing ICD data:', error); - throw error; - } - } - @Get('search') @ApiOperation({ summary: 'Search ICD codes with filters and pagination', diff --git a/src/icd/icd.service.ts b/src/icd/icd.service.ts index 23ed54a..8eafe4b 100644 --- a/src/icd/icd.service.ts +++ b/src/icd/icd.service.ts @@ -1,158 +1,11 @@ import { Injectable, Logger } from '@nestjs/common'; -import { PrismaClient } from '../../generated/prisma'; -import * as XLSX from 'xlsx'; -import * as path from 'path'; -import * as fs from 'fs'; - -interface IcdData { - code: string; - display: string; - version: string; -} +import { PrismaClient } from '@prisma/client'; @Injectable() export class IcdService { private readonly logger = new Logger(IcdService.name); private readonly prisma = new PrismaClient(); - async importIcdData(): Promise<{ - icd9Count: number; - icd10Count: number; - total: number; - }> { - try { - this.logger.log('Starting ICD data import...'); - - // Import ICD-9 data - const icd9Data = await this.readExcelFile( - 'test/[PUBLIC] ICD-9CM e-klaim.xlsx', - 'ICD9', - ); - - // Import ICD-10 data - const icd10Data = await this.readExcelFile( - 'test/[PUBLIC] ICD-10 e-klaim.xlsx', - 'ICD10', - ); - - // Clear existing data - await this.prisma.icdCode.deleteMany({}); - this.logger.log('Cleared existing ICD data'); - - // Insert ICD-9 data - const icd9Count = await this.bulkInsertData(icd9Data, 'ICD9'); - this.logger.log(`Imported ${icd9Count} ICD-9 codes`); - - // Insert ICD-10 data - const icd10Count = await this.bulkInsertData(icd10Data, 'ICD10'); - this.logger.log(`Imported ${icd10Count} ICD-10 codes`); - - const total = icd9Count + icd10Count; - this.logger.log(`Total imported: ${total} ICD codes`); - - return { - icd9Count, - icd10Count, - total, - }; - } catch (error) { - this.logger.error('Error importing ICD data:', error); - throw error; - } - } - - private async readExcelFile( - filePath: string, - category: string, - ): Promise { - try { - const fullPath = path.join(process.cwd(), filePath); - - if (!fs.existsSync(fullPath)) { - throw new Error(`File not found: ${fullPath}`); - } - - this.logger.log(`Reading ${category} file: ${filePath}`); - - const workbook = XLSX.readFile(fullPath); - const sheetName = workbook.SheetNames[0]; - const worksheet = workbook.Sheets[sheetName]; - - // Convert sheet to JSON - const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 }); - - // Skip header row and process data - const icdData: IcdData[] = []; - - for (let i = 1; i < jsonData.length; i++) { - const row = jsonData[i] as any[]; - - if (row && row.length >= 3) { - const code = this.cleanString(row[0]); - const display = this.cleanString(row[1]); - const version = this.cleanString(row[2]); - - if (code && display && version) { - icdData.push({ - code, - display, - version, - }); - } - } - } - - this.logger.log(`Found ${icdData.length} valid ${category} records`); - return icdData; - } catch (error) { - this.logger.error(`Error reading ${category} file:`, error); - throw error; - } - } - - private async bulkInsertData( - data: IcdData[], - category: string, - ): Promise { - try { - const batchSize = 1000; - let totalInserted = 0; - - for (let i = 0; i < data.length; i += batchSize) { - const batch = data.slice(i, i + batchSize); - - const insertData = batch.map((item) => ({ - code: item.code, - display: item.display, - version: item.version, - category, - })); - - await this.prisma.icdCode.createMany({ - data: insertData, - skipDuplicates: true, - }); - - totalInserted += batch.length; - this.logger.log( - `Inserted batch ${Math.floor(i / batchSize) + 1} for ${category}: ${batch.length} records`, - ); - } - - return totalInserted; - } catch (error) { - this.logger.error(`Error inserting ${category} data:`, error); - throw error; - } - } - - private cleanString(value: any): string { - if (value === null || value === undefined) { - return ''; - } - return String(value).trim(); - } - async findIcdCodes( category?: string, search?: string, diff --git a/src/icd/pgvector.controller.ts b/src/icd/pgvector.controller.ts index 4eefd30..83ec77a 100644 --- a/src/icd/pgvector.controller.ts +++ b/src/icd/pgvector.controller.ts @@ -8,6 +8,15 @@ import { ValidationPipe, UsePipes, } from '@nestjs/common'; +import { + IsString, + IsOptional, + IsNumber, + IsEnum, + Min, + Max, + IsNotEmpty, +} from 'class-validator'; import { ApiTags, ApiOperation, @@ -27,6 +36,8 @@ export class VectorSearchDto { minLength: 1, maxLength: 500, }) + @IsString() + @IsNotEmpty() query: string; @ApiProperty({ @@ -37,6 +48,10 @@ export class VectorSearchDto { maximum: 100, default: 10, }) + @IsOptional() + @IsNumber() + @Min(1) + @Max(100) limit?: number; @ApiProperty({ @@ -46,16 +61,22 @@ export class VectorSearchDto { enum: ['ICD9', 'ICD10'], default: undefined, }) + @IsOptional() + @IsEnum(['ICD9', 'ICD10']) category?: string; @ApiProperty({ description: 'Similarity threshold (0.0 - 1.0) for filtering results', - example: 0.7, + example: 0.85, required: false, minimum: 0.0, maximum: 1.0, - default: 0.7, + default: 0.85, }) + @IsOptional() + @IsNumber() + @Min(0.0) + @Max(1.0) threshold?: number; } @@ -66,6 +87,8 @@ export class EmbeddingRequestDto { minLength: 1, maxLength: 1000, }) + @IsString() + @IsNotEmpty() text: string; @ApiProperty({ @@ -74,9 +97,24 @@ export class EmbeddingRequestDto { required: false, default: 'text-embedding-ada-002', }) + @IsOptional() + @IsString() model?: string; } +export class ThresholdConfigDto { + @ApiProperty({ + description: 'Similarity threshold value (0.0 - 1.0)', + example: 0.85, + minimum: 0.0, + maximum: 1.0, + }) + @IsNumber() + @Min(0.0) + @Max(1.0) + threshold: number; +} + export class VectorSearchResponseDto { @ApiProperty({ description: 'Array of search results with similarity scores', @@ -486,6 +524,61 @@ export class PgVectorController { }; } + @Post('advanced-search') + @ApiOperation({ + summary: 'Advanced vector similarity search', + description: + 'Advanced vector search using multiple similarity metrics (cosine + euclidean) for more accurate results with higher threshold.', + tags: ['PgVector Operations'], + }) + @ApiConsumes('application/json') + @ApiProduces('application/json') + @ApiBody({ + type: VectorSearchDto, + description: 'Search parameters for advanced vector search', + examples: { + highPrecision: { + summary: 'High precision search', + value: { + query: 'diabetes mellitus type 2', + limit: 10, + category: 'ICD10', + threshold: 0.9, + }, + }, + }, + }) + @ApiResponse({ + status: HttpStatus.OK, + description: + 'Advanced vector search results with enhanced similarity scores', + type: VectorSearchResponseDto, + }) + @ApiResponse({ + status: HttpStatus.BAD_REQUEST, + description: 'Invalid search parameters', + }) + @ApiResponse({ + status: HttpStatus.INTERNAL_SERVER_ERROR, + description: 'Error during advanced vector search', + }) + async advancedVectorSearch( + @Body() searchDto: VectorSearchDto, + ): Promise { + const results = await this.pgVectorService.advancedVectorSearch( + searchDto.query, + searchDto.limit || 10, + searchDto.category, + searchDto.threshold, + ); + + return { + data: results, + total: results.length, + query: searchDto.query, + }; + } + @Post('generate-embedding') @ApiOperation({ summary: 'Generate text embedding', @@ -570,6 +663,50 @@ export class PgVectorController { }; } + @Post('regenerate-embeddings-enhanced') + @ApiOperation({ + summary: 'Regenerate embeddings with enhanced text representation', + description: + 'Regenerate existing embeddings using enhanced text representation for better similarity scores. This improves search quality.', + tags: ['PgVector Operations'], + }) + @ApiConsumes('application/json') + @ApiProduces('application/json') + @ApiResponse({ + status: HttpStatus.OK, + description: 'Enhanced embedding regeneration results summary', + schema: { + type: 'object', + properties: { + processed: { type: 'number', example: 100 }, + errors: { type: 'number', example: 0 }, + totalSample: { type: 'number', example: 100 }, + message: { + type: 'string', + example: 'Enhanced embeddings regenerated successfully', + }, + }, + }, + }) + @ApiResponse({ + status: HttpStatus.INTERNAL_SERVER_ERROR, + description: 'Error during enhanced embedding regeneration', + }) + async regenerateEmbeddingsEnhanced(): Promise<{ + processed: number; + errors: number; + totalSample: number; + message: string; + }> { + const result = + await this.pgVectorService.regenerateEmbeddingsWithEnhancedText(); + + return { + ...result, + message: `Enhanced embeddings regenerated successfully. Processed: ${result.processed}, Errors: ${result.errors}`, + }; + } + @Get('stats') @ApiOperation({ summary: 'Get embedding statistics', @@ -640,6 +777,234 @@ export class PgVectorController { }; } + @Get('threshold') + @ApiOperation({ + summary: 'Get current similarity threshold', + description: + 'Get the current similarity threshold configuration used for vector search filtering.', + tags: ['PgVector Operations'], + }) + @ApiProduces('application/json') + @ApiResponse({ + status: HttpStatus.OK, + description: 'Current similarity threshold configuration', + schema: { + type: 'object', + properties: { + threshold: { + type: 'number', + description: 'Current similarity threshold value', + example: 0.85, + }, + description: { + type: 'string', + description: 'Description of the threshold setting', + example: 'Minimum similarity score required for search results', + }, + }, + }, + }) + @ApiResponse({ + status: HttpStatus.INTERNAL_SERVER_ERROR, + description: 'Error retrieving threshold configuration', + }) + async getSimilarityThreshold(): Promise<{ + threshold: number; + description: string; + }> { + const threshold = this.pgVectorService.getSimilarityThreshold(); + return { + threshold, + description: + 'Minimum similarity score required for search results (0.0 - 1.0)', + }; + } + + @Get('model') + @ApiOperation({ + summary: 'Get current embedding model', + description: + 'Get the current OpenAI embedding model configuration used for vector generation.', + tags: ['PgVector Operations'], + }) + @ApiProduces('application/json') + @ApiResponse({ + status: HttpStatus.OK, + description: 'Current embedding model configuration', + schema: { + type: 'object', + properties: { + model: { + type: 'string', + description: 'Current embedding model name', + example: 'text-embedding-ada-002', + }, + description: { + type: 'string', + description: 'Description of the model configuration', + example: 'OpenAI embedding model for vector generation', + }, + source: { + type: 'string', + description: 'Source of the model configuration', + example: 'Environment Variable', + }, + }, + }, + }) + @ApiResponse({ + status: HttpStatus.INTERNAL_SERVER_ERROR, + description: 'Error retrieving model configuration', + }) + async getEmbeddingModel(): Promise<{ + model: string; + description: string; + source: string; + }> { + const model = this.pgVectorService.getEmbeddingModel(); + const source = process.env.OPENAI_API_MODEL + ? 'Environment Variable' + : 'Default'; + + return { + model, + description: 'OpenAI embedding model for vector generation', + source, + }; + } + + @Post('threshold') + @ApiOperation({ + summary: 'Set similarity threshold', + description: + 'Set the similarity threshold for vector search filtering. Higher values result in more strict matching.', + tags: ['PgVector Operations'], + }) + @ApiConsumes('application/json') + @ApiProduces('application/json') + @ApiBody({ + type: ThresholdConfigDto, + description: 'Threshold configuration parameters', + }) + @ApiResponse({ + status: HttpStatus.OK, + description: 'Similarity threshold updated successfully', + schema: { + type: 'object', + properties: { + message: { + type: 'string', + description: 'Success message', + example: 'Similarity threshold updated successfully', + }, + threshold: { + type: 'number', + description: 'Updated threshold value', + example: 0.9, + }, + previousThreshold: { + type: 'number', + description: 'Previous threshold value', + example: 0.85, + }, + }, + }, + }) + @ApiResponse({ + status: HttpStatus.BAD_REQUEST, + description: 'Invalid threshold value (must be between 0.0 and 1.0)', + }) + @ApiResponse({ + status: HttpStatus.INTERNAL_SERVER_ERROR, + description: 'Error updating threshold configuration', + }) + async setSimilarityThreshold( + @Body() thresholdConfig: ThresholdConfigDto, + ): Promise<{ + message: string; + threshold: number; + previousThreshold: number; + }> { + const previousThreshold = this.pgVectorService.getSimilarityThreshold(); + this.pgVectorService.setSimilarityThreshold(thresholdConfig.threshold); + + return { + message: 'Similarity threshold updated successfully', + threshold: thresholdConfig.threshold, + previousThreshold, + }; + } + + @Post('model') + @ApiOperation({ + summary: 'Set embedding model', + description: + 'Set the OpenAI embedding model for vector generation. This will reinitialize the embeddings service.', + tags: ['PgVector Operations'], + }) + @ApiConsumes('application/json') + @ApiProduces('application/json') + @ApiBody({ + schema: { + type: 'object', + properties: { + model: { + type: 'string', + description: 'OpenAI embedding model name', + example: 'text-embedding-ada-002', + }, + }, + required: ['model'], + }, + description: 'Model configuration parameters', + }) + @ApiResponse({ + status: HttpStatus.OK, + description: 'Embedding model updated successfully', + schema: { + type: 'object', + properties: { + message: { + type: 'string', + description: 'Success message', + example: 'Embedding model updated successfully', + }, + model: { + type: 'string', + description: 'Updated model name', + example: 'text-embedding-ada-002', + }, + previousModel: { + type: 'string', + description: 'Previous model name', + example: 'text-embedding-ada-002', + }, + }, + }, + }) + @ApiResponse({ + status: HttpStatus.BAD_REQUEST, + description: 'Invalid model name', + }) + @ApiResponse({ + status: HttpStatus.INTERNAL_SERVER_ERROR, + description: 'Error updating model configuration', + }) + async setEmbeddingModel(@Body() body: { model: string }): Promise<{ + message: string; + model: string; + previousModel: string; + }> { + const previousModel = this.pgVectorService.getEmbeddingModel(); + await this.pgVectorService.setEmbeddingModel(body.model); + + return { + message: 'Embedding model updated successfully', + model: body.model, + previousModel, + }; + } + @Post('refresh') @ApiOperation({ summary: 'Refresh pgvector store', diff --git a/src/icd/pgvector.service.ts b/src/icd/pgvector.service.ts index 0df4e8e..0d41af5 100644 --- a/src/icd/pgvector.service.ts +++ b/src/icd/pgvector.service.ts @@ -1,8 +1,7 @@ import { Injectable, Logger } from '@nestjs/common'; -import { PrismaClient } from '../../generated/prisma'; +import { PrismaClient } from '@prisma/client'; import { OpenAIEmbeddings } from '@langchain/openai'; import { PGVectorStore } from '@langchain/community/vectorstores/pgvector'; -import { Document } from 'langchain/document'; import { Pool } from 'pg'; export interface VectorSearchResult { @@ -72,6 +71,41 @@ export class PgVectorService { } } + /** + * Reinitialize OpenAI embeddings with new model + */ + private async reinitializeEmbeddings(modelName: string): Promise { + try { + const apiKey = process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error('OPENAI_API_KEY not found'); + } + + this.logger.log( + `Reinitializing OpenAI embeddings with model: ${modelName}`, + ); + + // Create new embeddings instance with new model + this.embeddings = new OpenAIEmbeddings({ + openAIApiKey: apiKey, + modelName: modelName, + maxConcurrency: 5, + }); + + // Update environment variable to reflect current model + process.env.OPENAI_API_MODEL = modelName; + + this.logger.log( + `OpenAI embeddings reinitialized successfully with model: ${modelName}`, + ); + } catch (error) { + this.logger.error('Failed to reinitialize OpenAI embeddings:', error); + throw new Error( + `Failed to reinitialize OpenAI embeddings: ${error.message}`, + ); + } + } + /** * Initialize pgvector store dengan LangChain */ @@ -115,25 +149,35 @@ export class PgVectorService { /** * Generate embedding untuk text menggunakan OpenAI */ - async generateEmbedding( - text: string, - model: string = 'text-embedding-ada-002', - ): Promise { + async generateEmbedding(text: string, model?: string): Promise { try { + // Get model from parameter, environment variable, or use default + const embeddingModel = + model || process.env.OPENAI_API_MODEL || 'text-embedding-ada-002'; + this.logger.log( - `Generating embedding for text: ${text.substring(0, 100)}...`, + `Generating embedding for text: ${text.substring(0, 100)}... using model: ${embeddingModel}`, ); + // Check if we need to reinitialize embeddings with new model + const currentModel = this.getEmbeddingModel(); + if (model && model !== currentModel) { + this.logger.log( + `Switching embedding model from ${currentModel} to ${model}`, + ); + await this.reinitializeEmbeddings(model); + } + if (!this.embeddings) { throw new Error( 'OpenAI embeddings not initialized. Please check your API configuration.', ); } - // Use OpenAI embeddings + // Use OpenAI embeddings with current model const embedding = await this.embeddings.embedQuery(text); this.logger.log( - `Generated OpenAI embedding with ${embedding.length} dimensions`, + `Generated OpenAI embedding with ${embedding.length} dimensions using model: ${this.getEmbeddingModel()}`, ); return embedding; } catch (error) { @@ -191,7 +235,8 @@ export class PgVectorService { `UPDATE icd_codes SET embedding = $1::vector, metadata = $2::jsonb, - content = $3 + content = $3, + "updatedAt" = NOW() WHERE id = $4`, [ vectorString, @@ -289,7 +334,8 @@ export class PgVectorService { `UPDATE icd_codes SET embedding = $1::vector, metadata = $2::jsonb, - content = $3 + content = $3, + "updatedAt" = NOW() WHERE id = $4`, [ vectorString, @@ -337,16 +383,23 @@ export class PgVectorService { } /** - * Vector similarity search menggunakan pgvector + * Vector similarity search menggunakan pgvector dengan threshold yang dapat dikonfigurasi */ async vectorSearch( query: string, limit: number = 10, category?: string, - threshold: number = 0.7, + threshold?: number, ): Promise { + // Get threshold from environment variable or use default + const defaultThreshold = parseFloat( + process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85', + ); + const similarityThreshold = threshold || defaultThreshold; try { - this.logger.log(`Performing pgvector search for: ${query}`); + this.logger.log( + `Performing pgvector search for: ${query} with threshold: ${similarityThreshold}`, + ); if (!this.embeddings) { throw new Error('OpenAI embeddings not initialized'); @@ -358,17 +411,19 @@ export class PgVectorService { // Convert embedding array to proper vector format for pgvector const vectorString = `[${queryEmbedding.join(',')}]`; - // Build SQL query for vector similarity search + // Build SQL query for vector similarity search with higher precision + // Using cosine distance and converting to similarity score let sql = ` SELECT id, code, display, version, category, - 1 - (embedding <=> $1::vector) as similarity + (1 - (embedding <=> $1::vector)) as similarity FROM icd_codes WHERE embedding IS NOT NULL + AND (1 - (embedding <=> $1::vector)) >= $2 `; - const params: any[] = [vectorString]; - let paramIndex = 2; + const params: any[] = [vectorString, similarityThreshold]; + let paramIndex = 3; if (category) { sql += ` AND category = $${paramIndex}`; @@ -376,23 +431,24 @@ export class PgVectorService { paramIndex++; } - sql += ` ORDER BY embedding <=> $1::vector ASC LIMIT $${paramIndex}`; + // Order by similarity descending and limit results + sql += ` ORDER BY similarity DESC LIMIT $${paramIndex}`; params.push(limit); // Execute raw SQL query const result = await this.pool.query(sql, params); - // Transform and filter results - const filteredResults: VectorSearchResult[] = result.rows - .filter((row: any) => row.similarity >= threshold) - .map((row: any) => ({ + // Transform results (no need to filter again since SQL already filters) + const filteredResults: VectorSearchResult[] = result.rows.map( + (row: any) => ({ id: row.id, code: row.code, display: row.display, version: row.version, category: row.category, similarity: parseFloat(row.similarity), - })); + }), + ); this.logger.log( `Pgvector search returned ${filteredResults.length} results for query: "${query}"`, @@ -417,12 +473,12 @@ export class PgVectorService { try { this.logger.log(`Performing hybrid search for: ${query}`); - // Get vector search results + // Get vector search results with higher threshold const vectorResults = await this.vectorSearch( query, limit * 2, category, - 0.5, + parseFloat(process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85'), ); // Get text search results @@ -475,7 +531,7 @@ export class PgVectorService { try { let sql = 'SELECT id, code, display, version, category FROM icd_codes'; const params: any[] = []; - let whereConditions: string[] = []; + const whereConditions: string[] = []; let paramIndex = 1; if (category) { @@ -515,6 +571,51 @@ export class PgVectorService { } } + /** + * Get current similarity threshold configuration + */ + getSimilarityThreshold(): number { + return parseFloat(process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85'); + } + + /** + * Get current embedding model configuration + */ + getEmbeddingModel(): string { + return process.env.OPENAI_API_MODEL || 'text-embedding-ada-002'; + } + + /** + * Set similarity threshold (for runtime configuration) + */ + setSimilarityThreshold(threshold: number): void { + if (threshold < 0 || threshold > 1) { + throw new Error('Similarity threshold must be between 0 and 1'); + } + process.env.VECTOR_SIMILARITY_THRESHOLD = threshold.toString(); + this.logger.log(`Similarity threshold updated to: ${threshold}`); + } + + /** + * Set embedding model (for runtime configuration) + */ + async setEmbeddingModel(modelName: string): Promise { + if (!modelName || typeof modelName !== 'string') { + throw new Error('Model name must be a valid string'); + } + + const currentModel = this.getEmbeddingModel(); + if (modelName === currentModel) { + this.logger.log(`Model ${modelName} is already active`); + return; + } + + this.logger.log( + `Switching embedding model from ${currentModel} to ${modelName}`, + ); + await this.reinitializeEmbeddings(modelName); + } + /** * Get embedding statistics */ @@ -524,6 +625,7 @@ export class PgVectorService { withoutEmbeddings: number; percentage: number; vectorStoreStatus: string; + currentThreshold: number; }> { try { // Use raw SQL to get embedding statistics @@ -548,6 +650,7 @@ export class PgVectorService { withoutEmbeddings, percentage: Math.round(percentage * 100) / 100, vectorStoreStatus, + currentThreshold: this.getSimilarityThreshold(), }; } catch (error) { this.logger.error('Error getting embedding stats:', error); @@ -569,6 +672,95 @@ export class PgVectorService { } } + /** + * Advanced vector search dengan multiple similarity metrics untuk mendapatkan hasil yang lebih akurat + */ + async advancedVectorSearch( + query: string, + limit: number = 10, + category?: string, + threshold?: number, + ): Promise { + try { + // Get threshold from environment variable or use default + const defaultThreshold = parseFloat( + process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85', + ); + const similarityThreshold = threshold || defaultThreshold; + + this.logger.log( + `Performing advanced vector search for: ${query} with threshold: ${similarityThreshold}`, + ); + + if (!this.embeddings) { + throw new Error('OpenAI embeddings not initialized'); + } + + // Generate embedding for query + const queryEmbedding = await this.generateEmbedding(query); + const vectorString = `[${queryEmbedding.join(',')}]`; + + // Advanced SQL query using multiple similarity metrics + let sql = ` + SELECT + id, code, display, version, category, + (1 - (embedding <=> $1::vector)) as cosine_similarity, + (1 - (embedding <-> $1::vector)) as euclidean_similarity, + (embedding <#> $1::vector) as negative_inner_product + FROM icd_codes + WHERE embedding IS NOT NULL + `; + + const params: any[] = [vectorString]; + let paramIndex = 2; + + if (category) { + sql += ` AND category = $${paramIndex}`; + params.push(category); + paramIndex++; + } + + // Filter by cosine similarity threshold + sql += ` AND (1 - (embedding <=> $1::vector)) >= $${paramIndex}`; + params.push(similarityThreshold); + paramIndex++; + + // Order by combined similarity score and limit + sql += ` ORDER BY cosine_similarity DESC, euclidean_similarity DESC LIMIT $${paramIndex}`; + params.push(limit); + + const result = await this.pool.query(sql, params); + + // Transform results with enhanced similarity scoring + const filteredResults: VectorSearchResult[] = result.rows.map( + (row: any) => { + const cosineSim = parseFloat(row.cosine_similarity); + const euclideanSim = parseFloat(row.euclidean_similarity); + + // Calculate combined similarity score (weighted average) + const combinedSimilarity = cosineSim * 0.7 + euclideanSim * 0.3; + + return { + id: row.id, + code: row.code, + display: row.display, + version: row.version, + category: row.category, + similarity: Math.round(combinedSimilarity * 1000) / 1000, // Round to 3 decimal places + }; + }, + ); + + this.logger.log( + `Advanced vector search returned ${filteredResults.length} results for query: "${query}" with threshold: ${similarityThreshold}`, + ); + return filteredResults; + } catch (error) { + this.logger.error('Error in advanced vector search:', error); + throw error; + } + } + /** * Get vector store status */ @@ -589,9 +781,10 @@ export class PgVectorService { initialized: !!this.vectorStore, documentCount, embeddingModel: this.embeddings - ? `OpenAI ${process.env.OPENAI_API_MODEL || 'text-embedding-ada-002'}` + ? `OpenAI ${this.getEmbeddingModel()}` : 'Not Available', lastUpdated: new Date(), + currentThreshold: this.getSimilarityThreshold(), }; return status; @@ -601,6 +794,164 @@ export class PgVectorService { } } + /** + * Create enhanced text representation for better embedding quality + */ + private createEnhancedTextRepresentation(code: any): string { + // Base text with code and display + let text = `${code.code} ${code.display}`; + + // Add category context + if (code.category === 'ICD9') { + text += ` ICD-9 CM procedure diagnosis`; + } else if (code.category === 'ICD10') { + text += ` ICD-10 diagnosis condition`; + } + + // Add version context + if (code.version) { + text += ` ${code.version}`; + } + + // Add medical context based on display content + const display = code.display.toLowerCase(); + + // Add procedure context + if ( + display.includes('procedure') || + display.includes('surgery') || + display.includes('operation') + ) { + text += ' medical procedure surgical intervention'; + } + + // Add diagnosis context + if ( + display.includes('diagnosis') || + display.includes('condition') || + display.includes('disease') + ) { + text += ' medical diagnosis clinical condition'; + } + + // Add anatomical context + if ( + display.includes('cranial') || + display.includes('brain') || + display.includes('head') + ) { + text += ' neurological cranial brain head'; + } + + if ( + display.includes('cardiac') || + display.includes('heart') || + display.includes('cardiovascular') + ) { + text += ' cardiac heart cardiovascular'; + } + + if ( + display.includes('pulmonary') || + display.includes('lung') || + display.includes('respiratory') + ) { + text += ' pulmonary lung respiratory'; + } + + // Add common medical terms + text += ' medical healthcare clinical'; + + return text; + } + + /** + * Regenerate embeddings with enhanced text representation for better similarity + */ + async regenerateEmbeddingsWithEnhancedText(limit: number = 100): Promise<{ + processed: number; + errors: number; + totalSample: number; + }> { + try { + this.logger.log( + `Starting enhanced embedding regeneration for ${limit} ICD codes...`, + ); + + // Get ICD codes with existing embeddings to regenerate + const codesWithEmbeddings = await this.pool.query( + 'SELECT id, code, display, version, category FROM icd_codes WHERE embedding IS NOT NULL LIMIT $1', + [limit], + ); + + if (codesWithEmbeddings.rows.length === 0) { + this.logger.log('No ICD codes found with embeddings to regenerate'); + return { processed: 0, errors: 0, totalSample: 0 }; + } + + this.logger.log( + `Found ${codesWithEmbeddings.rows.length} codes to regenerate with enhanced text`, + ); + + let processed = 0; + let errors = 0; + + // Process each code + for (let i = 0; i < codesWithEmbeddings.rows.length; i++) { + const code = codesWithEmbeddings.rows[i]; + try { + // Create enhanced text representation for better embedding quality + const text = this.createEnhancedTextRepresentation(code); + + // Generate new embedding + const embedding = await this.generateEmbedding(text); + + // Convert embedding array to proper vector format for pgvector + const vectorString = `[${embedding.join(',')}]`; + + // Update database with new embedding and enhanced content + await this.pool.query( + `UPDATE icd_codes + SET embedding = $1::vector, + content = $2, + "updatedAt" = NOW() + WHERE id = $3`, + [vectorString, text, code.id], + ); + + processed++; + + if (processed % 10 === 0) { + this.logger.log( + `Regenerated ${processed}/${codesWithEmbeddings.rows.length} enhanced embeddings`, + ); + } + } catch (error) { + this.logger.error( + `Error regenerating embedding for code ${code.code}:`, + error, + ); + errors++; + } + } + + this.logger.log( + `Enhanced embedding regeneration completed. Processed: ${processed}, Errors: ${errors}, Total: ${codesWithEmbeddings.rows.length}`, + ); + return { + processed, + errors, + totalSample: codesWithEmbeddings.rows.length, + }; + } catch (error) { + this.logger.error( + 'Error in regenerateEmbeddingsWithEnhancedText:', + error, + ); + throw error; + } + } + /** * Cleanup resources */