Try to fix similarity and add seed for master Excel ICD data
This commit is contained in:
@@ -176,4 +176,6 @@ METRICS_PATH=/metrics
|
||||
HEALTH_CHECK_PATH=/health
|
||||
|
||||
OPENAI_API_KEY=xxxxxx
|
||||
OPENAI_API_MODEL=text-embedding-ada-002
|
||||
OPENAI_API_MODEL=text-embedding-ada-002
|
||||
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||
@@ -1,7 +1,8 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# PostgreSQL Database with pgvector extension
|
||||
postgres:
|
||||
image: pgvector/pgvector:pg15
|
||||
image: pgvector/pgvector:pg17
|
||||
container_name: claim-guard-postgres
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
@@ -9,7 +10,7 @@ services:
|
||||
POSTGRES_USER: postgres
|
||||
POSTGRES_PASSWORD: postgres123
|
||||
ports:
|
||||
- '5432:5432'
|
||||
- '5433:5432' # host:container → access from the host uses port 5433
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
- ./docker/postgres/init:/docker-entrypoint-initdb.d
|
||||
|
||||
73
docs/ENVIRONMENT_VARIABLES.md
Normal file
73
docs/ENVIRONMENT_VARIABLES.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# Environment Variables
|
||||
|
||||
## Database Configuration
|
||||
- `DATABASE_URL`: PostgreSQL connection string
|
||||
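- Example: `postgresql://username:password@localhost:5432/claim_guard_db` (use port `5433` instead of `5432` when connecting from the host to the Postgres container started by the project's Docker Compose setup, which publishes `5433:5432`)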
|
||||
|
||||
## OpenAI Configuration
|
||||
- `OPENAI_API_KEY`: Your OpenAI API key for embeddings
|
||||
- `OPENAI_API_MODEL`: OpenAI model for embeddings (default: `text-embedding-ada-002`)
|
||||
|
||||
## Vector Search Configuration
|
||||
- `VECTOR_SIMILARITY_THRESHOLD`: Minimum similarity threshold for vector search (default: `0.85`)
|
||||
- Range: 0.0 to 1.0
|
||||
- Higher values = more strict matching
|
||||
- Recommended: 0.85 for production, 0.7 for development
|
||||
|
||||
## Application Configuration
|
||||
- `PORT`: Application port (default: 3000)
|
||||
- `NODE_ENV`: Environment mode (development/production)
|
||||
|
||||
## Example .env file
|
||||
```bash
|
||||
# Database
|
||||
DATABASE_URL="postgresql://username:password@localhost:5432/claim_guard_db"
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY="your-openai-api-key-here"
|
||||
OPENAI_API_MODEL="text-embedding-ada-002"
|
||||
|
||||
# Vector Search
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||
|
||||
# App
|
||||
PORT=3000
|
||||
NODE_ENV=development
|
||||
```
|
||||
|
||||
## Similarity Threshold Guidelines
|
||||
|
||||
### Production Environment
|
||||
- **High Precision**: 0.90 - 0.95 (very strict matching)
|
||||
- **Standard**: 0.85 - 0.90 (recommended for most use cases)
|
||||
- **Balanced**: 0.80 - 0.85 (good balance between precision and recall)
|
||||
|
||||
### Development Environment
|
||||
- **Testing**: 0.70 - 0.80 (more lenient for testing)
|
||||
- **Debugging**: 0.60 - 0.70 (very lenient for development)
|
||||
|
||||
### How to Set Threshold
|
||||
|
||||
#### Via Environment Variable
|
||||
```bash
|
||||
export VECTOR_SIMILARITY_THRESHOLD=0.90
|
||||
```
|
||||
|
||||
#### Via .env file
|
||||
```bash
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.90
|
||||
```
|
||||
|
||||
#### Via API (Runtime)
|
||||
```http
|
||||
POST /api/pgvector/threshold
|
||||
{
|
||||
"threshold": 0.90
|
||||
}
|
||||
```
|
||||
|
||||
## Impact of Threshold Changes
|
||||
|
||||
- **Higher Threshold (0.90+)**: Fewer results, higher precision, more relevant matches
|
||||
- **Lower Threshold (below 0.70)**: More results, lower precision, may include less relevant matches
|
||||
- **Optimal Range (0.80-0.90)**: Good balance between precision and recall for most medical coding use cases
|
||||
243
docs/SIMILARITY_THRESHOLD.md
Normal file
243
docs/SIMILARITY_THRESHOLD.md
Normal file
@@ -0,0 +1,243 @@
|
||||
# Similarity Threshold Configuration
|
||||
|
||||
## Overview
|
||||
|
||||
The similarity threshold feature allows you to control the precision of vector search results by setting a minimum similarity score required for results to be returned. This ensures that only highly relevant matches are included in search results.
|
||||
|
||||
## Default Configuration
|
||||
|
||||
- **Default Threshold**: `0.85` (85% similarity)
|
||||
- **Environment Variable**: `VECTOR_SIMILARITY_THRESHOLD`
|
||||
- **Range**: 0.0 to 1.0 (0% to 100% similarity)
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### 1. Get Current Threshold
|
||||
|
||||
```http
|
||||
GET /api/pgvector/threshold
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"threshold": 0.85,
|
||||
"description": "Minimum similarity score required for search results (0.0 - 1.0)"
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Set Threshold
|
||||
|
||||
```http
|
||||
POST /api/pgvector/threshold
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"threshold": 0.90
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"message": "Similarity threshold updated successfully",
|
||||
"threshold": 0.9,
|
||||
"previousThreshold": 0.85
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Advanced Vector Search
|
||||
|
||||
```http
|
||||
POST /api/pgvector/advanced-search
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"query": "diabetes mellitus type 2",
|
||||
"limit": 10,
|
||||
"category": "ICD10",
|
||||
"threshold": 0.90
|
||||
}
|
||||
```
|
||||
|
||||
## Search Methods
|
||||
|
||||
### Standard Vector Search
|
||||
|
||||
- Uses cosine similarity
|
||||
- Default threshold from environment variable
|
||||
- Good for general use cases
|
||||
|
||||
### Advanced Vector Search
|
||||
|
||||
- Combines cosine and euclidean similarity metrics
|
||||
- Weighted scoring: 70% cosine + 30% euclidean
|
||||
- Higher precision results
|
||||
- Recommended for production use
|
||||
|
||||
### Hybrid Search
|
||||
|
||||
- Combines vector similarity with text search
|
||||
- Uses threshold from environment variable
|
||||
- Best balance of semantic and text matching
|
||||
|
||||
## Threshold Recommendations
|
||||
|
||||
### Medical Coding Use Cases
|
||||
|
||||
| Use Case | Recommended Threshold | Description |
|
||||
| ---------------------------- | --------------------- | --------------------------------------------- |
|
||||
| **High Precision Diagnosis** | 0.90 - 0.95 | Very strict matching for critical diagnoses |
|
||||
| **Standard Medical Coding** | 0.85 - 0.90 | Recommended for most medical coding scenarios |
|
||||
| **General Medical Search** | 0.80 - 0.85 | Good balance between precision and recall |
|
||||
| **Research & Exploration** | 0.70 - 0.80 | More lenient for research purposes |
|
||||
|
||||
### Environment-Specific Settings
|
||||
|
||||
#### Production Environment
|
||||
|
||||
```bash
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||
```
|
||||
|
||||
#### Development Environment
|
||||
|
||||
```bash
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.70
|
||||
```
|
||||
|
||||
#### Testing Environment
|
||||
|
||||
```bash
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.75
|
||||
```
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Environment Variable
|
||||
|
||||
```bash
|
||||
# Set in .env file
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||
|
||||
# Or set as system environment variable
|
||||
export VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||
```
|
||||
|
||||
### Runtime Configuration
|
||||
|
||||
```typescript
|
||||
// Get current threshold
|
||||
const currentThreshold = pgVectorService.getSimilarityThreshold();
|
||||
|
||||
// Set new threshold
|
||||
pgVectorService.setSimilarityThreshold(0.9);
|
||||
```
|
||||
|
||||
### SQL Query Optimization
|
||||
|
||||
The system automatically optimizes SQL queries to:
|
||||
|
||||
- Filter results at database level using threshold
|
||||
- Order results by similarity score
|
||||
- Use appropriate vector similarity operators
|
||||
|
||||
## Performance Impact
|
||||
|
||||
### Higher Threshold (0.90+)
|
||||
|
||||
- ✅ Fewer results to process
|
||||
- ✅ Higher precision
|
||||
- ❌ May miss relevant results
|
||||
- ❌ Slower query execution (more filtering)
|
||||
|
||||
### Lower Threshold (below 0.70)
|
||||
|
||||
- ✅ Faster query execution
|
||||
- ✅ More comprehensive results
|
||||
- ❌ Lower precision
|
||||
- ❌ More irrelevant results
|
||||
|
||||
### Optimal Range (0.80-0.90)
|
||||
|
||||
- ✅ Good balance of precision and performance
|
||||
- ✅ Suitable for most medical coding scenarios
|
||||
- ✅ Reasonable query execution time
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **No Results Returned**
|
||||
- Check if threshold is too high
|
||||
- Verify embeddings are generated
|
||||
- Check database connection
|
||||
|
||||
2. **Too Many Results**
|
||||
- Increase threshold value
|
||||
- Use advanced search method
|
||||
- Add category filters
|
||||
|
||||
3. **Performance Issues**
|
||||
- Optimize threshold for your use case
|
||||
- Use database indexes
|
||||
- Consider batch processing
|
||||
|
||||
### Debug Commands
|
||||
|
||||
```bash
|
||||
# Check current threshold
|
||||
curl -X GET http://localhost:3000/api/pgvector/threshold
|
||||
|
||||
# Get embedding statistics
|
||||
curl -X GET http://localhost:3000/api/pgvector/stats
|
||||
|
||||
# Test with different thresholds
|
||||
curl -X POST http://localhost:3000/api/pgvector/advanced-search \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"query": "test", "threshold": 0.80}'
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Start with Default**: Begin with threshold 0.85
|
||||
2. **Test Incrementally**: Adjust threshold in small increments (0.05)
|
||||
3. **Monitor Results**: Evaluate precision vs. recall trade-offs
|
||||
4. **Environment Specific**: Use different thresholds for different environments
|
||||
5. **Document Changes**: Keep track of threshold changes and their impact
|
||||
|
||||
## Migration Guide
|
||||
|
||||
### From Previous Version
|
||||
|
||||
If upgrading from a version without configurable threshold:
|
||||
|
||||
1. **Set Environment Variable**:
|
||||
|
||||
```bash
|
||||
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||
```
|
||||
|
||||
2. **Update Search Calls**:
|
||||
|
||||
```typescript
|
||||
// Old way (hardcoded 0.7)
|
||||
const results = await service.vectorSearch(query, limit, category, 0.7);
|
||||
|
||||
// New way (uses environment variable)
|
||||
const results = await service.vectorSearch(query, limit, category);
|
||||
```
|
||||
|
||||
3. **Test New Thresholds**:
|
||||
|
||||
```bash
|
||||
# Test with current threshold
|
||||
curl -X GET http://localhost:3000/api/pgvector/threshold
|
||||
|
||||
# Adjust if needed
|
||||
curl -X POST http://localhost:3000/api/pgvector/threshold \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"threshold": 0.90}'
|
||||
```
|
||||
@@ -7,7 +7,7 @@
|
||||
"license": "UNLICENSED",
|
||||
"scripts": {
|
||||
"build": "nest build",
|
||||
"format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\"",
|
||||
"format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\" \"prisma/seed/**/*.ts\"",
|
||||
"start": "nest start",
|
||||
"start:dev": "nest start --watch",
|
||||
"start:debug": "nest start --debug --watch",
|
||||
@@ -17,7 +17,9 @@
|
||||
"test:watch": "jest --watch",
|
||||
"test:cov": "jest --coverage",
|
||||
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
||||
"test:e2e": "jest --config ./test/jest-e2e.json"
|
||||
"test:e2e": "jest --config ./test/jest-e2e.json",
|
||||
"seed": "ts-node -r tsconfig-paths/register prisma/seed/seed.ts",
|
||||
"seed:icd": "ts-node -r tsconfig-paths/register prisma/seed/icd/icd.seed.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@langchain/community": "^0.3.53",
|
||||
|
||||
170
prisma/seed/icd/icd.seed.ts
Normal file
170
prisma/seed/icd/icd.seed.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import * as XLSX from 'xlsx';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'fs';
|
||||
|
||||
interface IcdData {
|
||||
code: string;
|
||||
display: string;
|
||||
version: string;
|
||||
}
|
||||
|
||||
/**
 * Seeds the `icdCode` table from the ICD-9CM and ICD-10 e-klaim Excel
 * workbooks located under `prisma/seed/icd/`.
 *
 * Every run replaces the table contents: all existing rows are deleted and
 * the rows parsed from the two workbooks are inserted in batches.
 */
export class IcdSeeder {
  private readonly prisma = new PrismaClient();

  /**
   * Parses both workbooks, clears the existing ICD rows and inserts the
   * parsed rows.
   *
   * Both files are read before anything is deleted, so a missing or
   * unreadable workbook aborts the run without touching existing data.
   * NOTE: the delete and the inserts are separate statements (no
   * transaction), so a failure during insertion leaves the table partially
   * filled.
   *
   * @returns The number of rows actually inserted for ICD-9, for ICD-10 and
   *          in total (rows skipped as duplicates are not counted).
   * @throws Rethrows any file or database error after logging it.
   */
  async seed(): Promise<{
    icd9Count: number;
    icd10Count: number;
    total: number;
  }> {
    try {
      console.log('Starting ICD data import...');

      // Import ICD-9 data
      const icd9Data = this.readExcelFile(
        'prisma/seed/icd/[PUBLIC] ICD-9CM e-klaim.xlsx',
        'ICD9',
      );

      // Import ICD-10 data
      const icd10Data = this.readExcelFile(
        'prisma/seed/icd/[PUBLIC] ICD-10 e-klaim.xlsx',
        'ICD10',
      );

      // Clear existing data
      await this.prisma.icdCode.deleteMany({});
      console.log('Cleared existing ICD data');

      // Insert ICD-9 data
      const icd9Count = await this.bulkInsertData(icd9Data, 'ICD9');
      console.log(`Imported ${icd9Count} ICD-9 codes`);

      // Insert ICD-10 data
      const icd10Count = await this.bulkInsertData(icd10Data, 'ICD10');
      console.log(`Imported ${icd10Count} ICD-10 codes`);

      const total = icd9Count + icd10Count;
      console.log(`Total imported: ${total} ICD codes`);

      return {
        icd9Count,
        icd10Count,
        total,
      };
    } catch (error) {
      console.error('Error importing ICD data:', error);
      throw error;
    }
  }

  /**
   * Reads the first sheet of an Excel workbook and converts its rows to
   * `IcdData` records.
   *
   * The first row is treated as a header and skipped. A row is kept only if
   * it has at least three cells and the first three (code, display, version)
   * are all non-empty after trimming.
   *
   * @param filePath Workbook path relative to the current working directory.
   * @param category Label used in log messages only.
   * @returns The valid records found in the sheet.
   * @throws Error when the file does not exist or the workbook has no sheet;
   *         any parsing error is logged and rethrown.
   */
  private readExcelFile(filePath: string, category: string): IcdData[] {
    try {
      const fullPath = path.join(process.cwd(), filePath);

      if (!fs.existsSync(fullPath)) {
        throw new Error(`File not found: ${fullPath}`);
      }

      console.log(`Reading ${category} file: ${filePath}`);

      const workbook = XLSX.readFile(fullPath);
      const sheetName = workbook.SheetNames[0];
      const worksheet = sheetName ? workbook.Sheets[sheetName] : undefined;

      // Fail with a clear message instead of an opaque error from the parser
      // when the workbook contains no sheets.
      if (!worksheet) {
        throw new Error(`No worksheet found in: ${fullPath}`);
      }

      // `header: 1` yields each row as a plain array of cell values.
      const rows = XLSX.utils.sheet_to_json<unknown[]>(worksheet, {
        header: 1,
      });

      const icdData: IcdData[] = [];

      // slice(1) skips the header row.
      for (const row of rows.slice(1)) {
        if (!Array.isArray(row) || row.length < 3) {
          continue;
        }

        const code = this.cleanString(row[0]);
        const display = this.cleanString(row[1]);
        const version = this.cleanString(row[2]);

        if (code && display && version) {
          icdData.push({ code, display, version });
        }
      }

      console.log(`Found ${icdData.length} valid ${category} records`);
      return icdData;
    } catch (error) {
      console.error(`Error reading ${category} file:`, error);
      throw error;
    }
  }

  /**
   * Inserts the records in batches of 1000 using `createMany`.
   *
   * @param data Records to insert.
   * @param category Value stored in the `category` column of every row.
   * @returns The number of rows the database actually inserted. Because
   *          `skipDuplicates` is enabled, this can be lower than
   *          `data.length`.
   * @throws Rethrows any database error after logging it.
   */
  private async bulkInsertData(
    data: IcdData[],
    category: string,
  ): Promise<number> {
    try {
      const batchSize = 1000;
      let totalInserted = 0;

      for (let offset = 0; offset < data.length; offset += batchSize) {
        const insertData = data
          .slice(offset, offset + batchSize)
          .map(({ code, display, version }) => ({
            code,
            display,
            version,
            category,
          }));

        // Use the count reported by the database rather than the batch size:
        // with `skipDuplicates` some rows may be silently dropped.
        const { count } = await this.prisma.icdCode.createMany({
          data: insertData,
          skipDuplicates: true,
        });

        totalInserted += count;
        console.log(
          `Inserted batch ${Math.floor(offset / batchSize) + 1} for ${category}: ${count} records`,
        );
      }

      return totalInserted;
    } catch (error) {
      console.error(`Error inserting ${category} data:`, error);
      throw error;
    }
  }

  /**
   * Converts a cell value to a trimmed string.
   *
   * @param value Raw cell value of any type.
   * @returns `''` for `null`/`undefined`, otherwise `String(value).trim()`.
   */
  private cleanString(value: unknown): string {
    if (value === null || value === undefined) {
      return '';
    }
    return String(value).trim();
  }

  /** Closes the Prisma client connection owned by this seeder. */
  async disconnect() {
    await this.prisma.$disconnect();
  }
}
|
||||
|
||||
// Standalone execution: runs the seeder when this file is the entry point
// (e.g. via the `seed:icd` script) and not when it is imported.
if (require.main === module) {
  const seeder = new IcdSeeder();

  seeder
    .seed()
    .then((result) => {
      console.log('ICD seeding completed successfully:', result);
    })
    .catch((error) => {
      console.error('ICD seeding failed:', error);
      // Set the exit code instead of calling process.exit(): exiting here
      // would terminate the process before the `finally` handler below can
      // disconnect the Prisma client.
      process.exitCode = 1;
    })
    .finally(() => seeder.disconnect());
}
|
||||
27
prisma/seed/seed.ts
Normal file
27
prisma/seed/seed.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { IcdSeeder } from './icd/icd.seed';
|
||||
|
||||
const prisma = new PrismaClient();
|
||||
|
||||
/**
 * Runs all database seeders (currently only the ICD seeder).
 *
 * The seeder's Prisma connection is released in `finally`, so it is closed
 * whether seeding succeeds or fails.
 *
 * @throws Rethrows any seeding error after logging it.
 */
async function main() {
  console.log('Starting database seeding...');

  const icdSeeder = new IcdSeeder();

  try {
    // Seed ICD data
    const icdResult = await icdSeeder.seed();
    console.log('ICD seeding completed:', icdResult);

    console.log('Database seeding completed successfully!');
  } catch (error) {
    console.error('Error during seeding:', error);
    throw error;
  } finally {
    await icdSeeder.disconnect();
  }
}
|
||||
|
||||
main()
  .catch((error) => {
    console.error(error);
    // Report the failure to the caller (CI, npm script): without this the
    // process would exit with code 0 even though seeding failed.
    process.exitCode = 1;
  })
  .finally(() => prisma.$disconnect());
|
||||
@@ -64,7 +64,7 @@ export class HealthController {
|
||||
status: 200,
|
||||
description: 'Application is ready',
|
||||
})
|
||||
async getReady() {
|
||||
getReady() {
|
||||
return { status: 'ready' };
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ export class HealthController {
|
||||
status: 200,
|
||||
description: 'Application is alive',
|
||||
})
|
||||
async getLive() {
|
||||
getLive() {
|
||||
return { status: 'alive' };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Controller, Get, Post, Query, Logger } from '@nestjs/common';
|
||||
import { Controller, Get, Query, Logger } from '@nestjs/common';
|
||||
import {
|
||||
ApiTags,
|
||||
ApiOperation,
|
||||
@@ -8,10 +8,8 @@ import {
|
||||
ApiInternalServerErrorResponse,
|
||||
} from '@nestjs/swagger';
|
||||
import { IcdService } from './icd.service';
|
||||
import { SearchIcdDto } from './dto/search-icd.dto';
|
||||
import {
|
||||
IcdSearchResponseDto,
|
||||
IcdImportResponseDto,
|
||||
IcdStatisticsResponseDto,
|
||||
ErrorResponseDto,
|
||||
} from './dto/icd-response.dto';
|
||||
@@ -23,40 +21,6 @@ export class IcdController {
|
||||
|
||||
constructor(private readonly icdService: IcdService) {}
|
||||
|
||||
@Post('import')
|
||||
@ApiOperation({
|
||||
summary: 'Import ICD data from Excel files',
|
||||
description:
|
||||
'Import ICD-9 and ICD-10 codes from Excel files located in the test directory. This operation will process both ICD files and insert/update the database with the latest codes.',
|
||||
})
|
||||
@ApiResponse({
|
||||
status: 200,
|
||||
description: 'ICD data imported successfully',
|
||||
type: IcdImportResponseDto,
|
||||
})
|
||||
@ApiBadRequestResponse({
|
||||
description: 'Bad request - Invalid file format or missing files',
|
||||
type: ErrorResponseDto,
|
||||
})
|
||||
@ApiInternalServerErrorResponse({
|
||||
description: 'Internal server error during import process',
|
||||
type: ErrorResponseDto,
|
||||
})
|
||||
async importData(): Promise<IcdImportResponseDto> {
|
||||
try {
|
||||
this.logger.log('Starting ICD data import...');
|
||||
const result = await this.icdService.importIcdData();
|
||||
return {
|
||||
success: true,
|
||||
message: 'ICD data imported successfully',
|
||||
data: result,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error('Error importing ICD data:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
@Get('search')
|
||||
@ApiOperation({
|
||||
summary: 'Search ICD codes with filters and pagination',
|
||||
|
||||
@@ -1,158 +1,11 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { PrismaClient } from '../../generated/prisma';
|
||||
import * as XLSX from 'xlsx';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'fs';
|
||||
|
||||
interface IcdData {
|
||||
code: string;
|
||||
display: string;
|
||||
version: string;
|
||||
}
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
@Injectable()
|
||||
export class IcdService {
|
||||
private readonly logger = new Logger(IcdService.name);
|
||||
private readonly prisma = new PrismaClient();
|
||||
|
||||
async importIcdData(): Promise<{
|
||||
icd9Count: number;
|
||||
icd10Count: number;
|
||||
total: number;
|
||||
}> {
|
||||
try {
|
||||
this.logger.log('Starting ICD data import...');
|
||||
|
||||
// Import ICD-9 data
|
||||
const icd9Data = await this.readExcelFile(
|
||||
'test/[PUBLIC] ICD-9CM e-klaim.xlsx',
|
||||
'ICD9',
|
||||
);
|
||||
|
||||
// Import ICD-10 data
|
||||
const icd10Data = await this.readExcelFile(
|
||||
'test/[PUBLIC] ICD-10 e-klaim.xlsx',
|
||||
'ICD10',
|
||||
);
|
||||
|
||||
// Clear existing data
|
||||
await this.prisma.icdCode.deleteMany({});
|
||||
this.logger.log('Cleared existing ICD data');
|
||||
|
||||
// Insert ICD-9 data
|
||||
const icd9Count = await this.bulkInsertData(icd9Data, 'ICD9');
|
||||
this.logger.log(`Imported ${icd9Count} ICD-9 codes`);
|
||||
|
||||
// Insert ICD-10 data
|
||||
const icd10Count = await this.bulkInsertData(icd10Data, 'ICD10');
|
||||
this.logger.log(`Imported ${icd10Count} ICD-10 codes`);
|
||||
|
||||
const total = icd9Count + icd10Count;
|
||||
this.logger.log(`Total imported: ${total} ICD codes`);
|
||||
|
||||
return {
|
||||
icd9Count,
|
||||
icd10Count,
|
||||
total,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error('Error importing ICD data:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async readExcelFile(
|
||||
filePath: string,
|
||||
category: string,
|
||||
): Promise<IcdData[]> {
|
||||
try {
|
||||
const fullPath = path.join(process.cwd(), filePath);
|
||||
|
||||
if (!fs.existsSync(fullPath)) {
|
||||
throw new Error(`File not found: ${fullPath}`);
|
||||
}
|
||||
|
||||
this.logger.log(`Reading ${category} file: ${filePath}`);
|
||||
|
||||
const workbook = XLSX.readFile(fullPath);
|
||||
const sheetName = workbook.SheetNames[0];
|
||||
const worksheet = workbook.Sheets[sheetName];
|
||||
|
||||
// Convert sheet to JSON
|
||||
const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 });
|
||||
|
||||
// Skip header row and process data
|
||||
const icdData: IcdData[] = [];
|
||||
|
||||
for (let i = 1; i < jsonData.length; i++) {
|
||||
const row = jsonData[i] as any[];
|
||||
|
||||
if (row && row.length >= 3) {
|
||||
const code = this.cleanString(row[0]);
|
||||
const display = this.cleanString(row[1]);
|
||||
const version = this.cleanString(row[2]);
|
||||
|
||||
if (code && display && version) {
|
||||
icdData.push({
|
||||
code,
|
||||
display,
|
||||
version,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(`Found ${icdData.length} valid ${category} records`);
|
||||
return icdData;
|
||||
} catch (error) {
|
||||
this.logger.error(`Error reading ${category} file:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async bulkInsertData(
|
||||
data: IcdData[],
|
||||
category: string,
|
||||
): Promise<number> {
|
||||
try {
|
||||
const batchSize = 1000;
|
||||
let totalInserted = 0;
|
||||
|
||||
for (let i = 0; i < data.length; i += batchSize) {
|
||||
const batch = data.slice(i, i + batchSize);
|
||||
|
||||
const insertData = batch.map((item) => ({
|
||||
code: item.code,
|
||||
display: item.display,
|
||||
version: item.version,
|
||||
category,
|
||||
}));
|
||||
|
||||
await this.prisma.icdCode.createMany({
|
||||
data: insertData,
|
||||
skipDuplicates: true,
|
||||
});
|
||||
|
||||
totalInserted += batch.length;
|
||||
this.logger.log(
|
||||
`Inserted batch ${Math.floor(i / batchSize) + 1} for ${category}: ${batch.length} records`,
|
||||
);
|
||||
}
|
||||
|
||||
return totalInserted;
|
||||
} catch (error) {
|
||||
this.logger.error(`Error inserting ${category} data:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private cleanString(value: any): string {
|
||||
if (value === null || value === undefined) {
|
||||
return '';
|
||||
}
|
||||
return String(value).trim();
|
||||
}
|
||||
|
||||
async findIcdCodes(
|
||||
category?: string,
|
||||
search?: string,
|
||||
|
||||
@@ -8,6 +8,15 @@ import {
|
||||
ValidationPipe,
|
||||
UsePipes,
|
||||
} from '@nestjs/common';
|
||||
import {
|
||||
IsString,
|
||||
IsOptional,
|
||||
IsNumber,
|
||||
IsEnum,
|
||||
Min,
|
||||
Max,
|
||||
IsNotEmpty,
|
||||
} from 'class-validator';
|
||||
import {
|
||||
ApiTags,
|
||||
ApiOperation,
|
||||
@@ -27,6 +36,8 @@ export class VectorSearchDto {
|
||||
minLength: 1,
|
||||
maxLength: 500,
|
||||
})
|
||||
@IsString()
|
||||
@IsNotEmpty()
|
||||
query: string;
|
||||
|
||||
@ApiProperty({
|
||||
@@ -37,6 +48,10 @@ export class VectorSearchDto {
|
||||
maximum: 100,
|
||||
default: 10,
|
||||
})
|
||||
@IsOptional()
|
||||
@IsNumber()
|
||||
@Min(1)
|
||||
@Max(100)
|
||||
limit?: number;
|
||||
|
||||
@ApiProperty({
|
||||
@@ -46,16 +61,22 @@ export class VectorSearchDto {
|
||||
enum: ['ICD9', 'ICD10'],
|
||||
default: undefined,
|
||||
})
|
||||
@IsOptional()
|
||||
@IsEnum(['ICD9', 'ICD10'])
|
||||
category?: string;
|
||||
|
||||
@ApiProperty({
|
||||
description: 'Similarity threshold (0.0 - 1.0) for filtering results',
|
||||
example: 0.7,
|
||||
example: 0.85,
|
||||
required: false,
|
||||
minimum: 0.0,
|
||||
maximum: 1.0,
|
||||
default: 0.7,
|
||||
default: 0.85,
|
||||
})
|
||||
@IsOptional()
|
||||
@IsNumber()
|
||||
@Min(0.0)
|
||||
@Max(1.0)
|
||||
threshold?: number;
|
||||
}
|
||||
|
||||
@@ -66,6 +87,8 @@ export class EmbeddingRequestDto {
|
||||
minLength: 1,
|
||||
maxLength: 1000,
|
||||
})
|
||||
@IsString()
|
||||
@IsNotEmpty()
|
||||
text: string;
|
||||
|
||||
@ApiProperty({
|
||||
@@ -74,9 +97,24 @@ export class EmbeddingRequestDto {
|
||||
required: false,
|
||||
default: 'text-embedding-ada-002',
|
||||
})
|
||||
@IsOptional()
|
||||
@IsString()
|
||||
model?: string;
|
||||
}
|
||||
|
||||
/**
 * Payload carrying a similarity threshold value.
 */
export class ThresholdConfigDto {
  /** Threshold value; validated as a number between 0 and 1 inclusive. */
  @ApiProperty({
    description: 'Similarity threshold value (0.0 - 1.0)',
    example: 0.85,
    minimum: 0,
    maximum: 1,
  })
  @IsNumber()
  @Min(0)
  @Max(1)
  threshold: number;
}
|
||||
|
||||
export class VectorSearchResponseDto {
|
||||
@ApiProperty({
|
||||
description: 'Array of search results with similarity scores',
|
||||
@@ -486,6 +524,61 @@ export class PgVectorController {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * POST `advanced-search`: forwards the search parameters to
 * `pgVectorService.advancedVectorSearch` and wraps the results.
 *
 * @param searchDto Search parameters taken from the request body.
 * @returns The results, their count and the original query string.
 */
@Post('advanced-search')
@ApiOperation({
  summary: 'Advanced vector similarity search',
  description:
    'Advanced vector search using multiple similarity metrics (cosine + euclidean) for more accurate results with higher threshold.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiBody({
  type: VectorSearchDto,
  description: 'Search parameters for advanced vector search',
  examples: {
    highPrecision: {
      summary: 'High precision search',
      value: {
        query: 'diabetes mellitus type 2',
        limit: 10,
        category: 'ICD10',
        threshold: 0.9,
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.OK,
  description:
    'Advanced vector search results with enhanced similarity scores',
  type: VectorSearchResponseDto,
})
@ApiResponse({
  status: HttpStatus.BAD_REQUEST,
  description: 'Invalid search parameters',
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error during advanced vector search',
})
async advancedVectorSearch(
  @Body() searchDto: VectorSearchDto,
): Promise<VectorSearchResponseDto> {
  const { query, category, threshold } = searchDto;
  // A missing or falsy `limit` (including 0) falls back to 10 results.
  const limit = searchDto.limit || 10;

  const data = await this.pgVectorService.advancedVectorSearch(
    query,
    limit,
    category,
    threshold,
  );

  return { data, total: data.length, query };
}
|
||||
|
||||
@Post('generate-embedding')
|
||||
@ApiOperation({
|
||||
summary: 'Generate text embedding',
|
||||
@@ -570,6 +663,50 @@ export class PgVectorController {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * POST `regenerate-embeddings-enhanced`: runs
 * `pgVectorService.regenerateEmbeddingsWithEnhancedText` and returns its
 * summary together with a human-readable message.
 *
 * @returns The service summary fields plus `message`.
 */
@Post('regenerate-embeddings-enhanced')
@ApiOperation({
  summary: 'Regenerate embeddings with enhanced text representation',
  description:
    'Regenerate existing embeddings using enhanced text representation for better similarity scores. This improves search quality.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Enhanced embedding regeneration results summary',
  schema: {
    type: 'object',
    properties: {
      processed: { type: 'number', example: 100 },
      errors: { type: 'number', example: 0 },
      totalSample: { type: 'number', example: 100 },
      message: {
        type: 'string',
        example: 'Enhanced embeddings regenerated successfully',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error during enhanced embedding regeneration',
})
async regenerateEmbeddingsEnhanced(): Promise<{
  processed: number;
  errors: number;
  totalSample: number;
  message: string;
}> {
  const summary =
    await this.pgVectorService.regenerateEmbeddingsWithEnhancedText();
  const { processed, errors } = summary;
  const message = `Enhanced embeddings regenerated successfully. Processed: ${processed}, Errors: ${errors}`;

  return { ...summary, message };
}
|
||||
|
||||
@Get('stats')
|
||||
@ApiOperation({
|
||||
summary: 'Get embedding statistics',
|
||||
@@ -640,6 +777,234 @@ export class PgVectorController {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * GET `threshold`: reports the value returned by
 * `pgVectorService.getSimilarityThreshold()` with a fixed description.
 *
 * @returns The current threshold and a description string.
 */
@Get('threshold')
@ApiOperation({
  summary: 'Get current similarity threshold',
  description:
    'Get the current similarity threshold configuration used for vector search filtering.',
  tags: ['PgVector Operations'],
})
@ApiProduces('application/json')
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Current similarity threshold configuration',
  schema: {
    type: 'object',
    properties: {
      threshold: {
        type: 'number',
        description: 'Current similarity threshold value',
        example: 0.85,
      },
      description: {
        type: 'string',
        description: 'Description of the threshold setting',
        example: 'Minimum similarity score required for search results',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error retrieving threshold configuration',
})
async getSimilarityThreshold(): Promise<{
  threshold: number;
  description: string;
}> {
  return {
    threshold: this.pgVectorService.getSimilarityThreshold(),
    description:
      'Minimum similarity score required for search results (0.0 - 1.0)',
  };
}
|
||||
|
||||
/**
 * GET `model`: reports the model name returned by
 * `pgVectorService.getEmbeddingModel()` and whether the
 * `OPENAI_API_MODEL` environment variable is set.
 *
 * @returns The model name, a fixed description, and `source`, which is
 *          'Environment Variable' when `OPENAI_API_MODEL` is set to a
 *          non-empty value and 'Default' otherwise.
 */
@Get('model')
@ApiOperation({
  summary: 'Get current embedding model',
  description:
    'Get the current OpenAI embedding model configuration used for vector generation.',
  tags: ['PgVector Operations'],
})
@ApiProduces('application/json')
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Current embedding model configuration',
  schema: {
    type: 'object',
    properties: {
      model: {
        type: 'string',
        description: 'Current embedding model name',
        example: 'text-embedding-ada-002',
      },
      description: {
        type: 'string',
        description: 'Description of the model configuration',
        example: 'OpenAI embedding model for vector generation',
      },
      source: {
        type: 'string',
        description: 'Source of the model configuration',
        example: 'Environment Variable',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error retrieving model configuration',
})
async getEmbeddingModel(): Promise<{
  model: string;
  description: string;
  source: string;
}> {
  const model = this.pgVectorService.getEmbeddingModel();

  let source = 'Default';
  if (process.env.OPENAI_API_MODEL) {
    source = 'Environment Variable';
  }

  return {
    model,
    description: 'OpenAI embedding model for vector generation',
    source,
  };
}
|
||||
|
||||
/**
 * Handler for the POST 'threshold' route.
 *
 * Captures the threshold currently in effect, asks the pgvector service to
 * apply the requested one, and reports both values back to the caller.
 *
 * @param thresholdConfig request body carrying the new `threshold`
 * @returns a success message, the requested threshold and the previous one
 */
@Post('threshold')
@ApiOperation({
  summary: 'Set similarity threshold',
  description:
    'Set the similarity threshold for vector search filtering. Higher values result in more strict matching.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiBody({
  type: ThresholdConfigDto,
  description: 'Threshold configuration parameters',
})
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Similarity threshold updated successfully',
  schema: {
    type: 'object',
    properties: {
      message: {
        type: 'string',
        description: 'Success message',
        example: 'Similarity threshold updated successfully',
      },
      threshold: {
        type: 'number',
        description: 'Updated threshold value',
        example: 0.9,
      },
      previousThreshold: {
        type: 'number',
        description: 'Previous threshold value',
        example: 0.85,
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.BAD_REQUEST,
  description: 'Invalid threshold value (must be between 0.0 and 1.0)',
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error updating threshold configuration',
})
async setSimilarityThreshold(
  @Body() thresholdConfig: ThresholdConfigDto,
): Promise<{
  message: string;
  threshold: number;
  previousThreshold: number;
}> {
  const { threshold } = thresholdConfig;

  // Read the old value first so it can be echoed back after the update.
  const previousThreshold = this.pgVectorService.getSimilarityThreshold();
  this.pgVectorService.setSimilarityThreshold(threshold);

  return {
    message: 'Similarity threshold updated successfully',
    threshold,
    previousThreshold,
  };
}
|
||||
|
||||
/**
 * Handler for the POST 'model' route.
 *
 * Records the model currently in use, asks the pgvector service to switch to
 * the requested one, and reports both names back to the caller.
 *
 * @param body request body carrying the new `model` name
 * @returns a success message, the requested model and the previous one
 */
@Post('model')
@ApiOperation({
  summary: 'Set embedding model',
  description:
    'Set the OpenAI embedding model for vector generation. This will reinitialize the embeddings service.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiBody({
  schema: {
    type: 'object',
    properties: {
      model: {
        type: 'string',
        description: 'OpenAI embedding model name',
        example: 'text-embedding-ada-002',
      },
    },
    required: ['model'],
  },
  description: 'Model configuration parameters',
})
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Embedding model updated successfully',
  schema: {
    type: 'object',
    properties: {
      message: {
        type: 'string',
        description: 'Success message',
        example: 'Embedding model updated successfully',
      },
      model: {
        type: 'string',
        description: 'Updated model name',
        example: 'text-embedding-ada-002',
      },
      previousModel: {
        type: 'string',
        description: 'Previous model name',
        example: 'text-embedding-ada-002',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.BAD_REQUEST,
  description: 'Invalid model name',
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error updating model configuration',
})
async setEmbeddingModel(@Body() body: { model: string }): Promise<{
  message: string;
  model: string;
  previousModel: string;
}> {
  const { model } = body;

  // Read the old model first so it can be echoed back after the switch.
  const previousModel = this.pgVectorService.getEmbeddingModel();
  await this.pgVectorService.setEmbeddingModel(model);

  return {
    message: 'Embedding model updated successfully',
    model,
    previousModel,
  };
}
|
||||
|
||||
@Post('refresh')
|
||||
@ApiOperation({
|
||||
summary: 'Refresh pgvector store',
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { PrismaClient } from '../../generated/prisma';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { OpenAIEmbeddings } from '@langchain/openai';
|
||||
import { PGVectorStore } from '@langchain/community/vectorstores/pgvector';
|
||||
import { Document } from 'langchain/document';
|
||||
import { Pool } from 'pg';
|
||||
|
||||
export interface VectorSearchResult {
|
||||
@@ -72,6 +71,41 @@ export class PgVectorService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Replace the OpenAI embeddings client with one bound to `modelName`.
 *
 * Requires OPENAI_API_KEY in the process environment. On success the new
 * client is stored on `this.embeddings` and OPENAI_API_MODEL is overwritten
 * with `modelName`, which is the value getEmbeddingModel() reads back.
 *
 * @param modelName name of the OpenAI embedding model to switch to
 * @throws Error wrapping the underlying message when the API key is missing
 *         or the client cannot be constructed
 */
private async reinitializeEmbeddings(modelName: string): Promise<void> {
  try {
    const { OPENAI_API_KEY: openAIApiKey } = process.env;
    if (!openAIApiKey) {
      throw new Error('OPENAI_API_KEY not found');
    }

    this.logger.log(
      `Reinitializing OpenAI embeddings with model: ${modelName}`,
    );

    this.embeddings = new OpenAIEmbeddings({
      openAIApiKey,
      modelName,
      maxConcurrency: 5,
    });

    // The environment variable doubles as the record of the active model.
    process.env.OPENAI_API_MODEL = modelName;

    this.logger.log(
      `OpenAI embeddings reinitialized successfully with model: ${modelName}`,
    );
  } catch (error) {
    this.logger.error('Failed to reinitialize OpenAI embeddings:', error);
    const reason = error.message;
    throw new Error(`Failed to reinitialize OpenAI embeddings: ${reason}`);
  }
}
|
||||
|
||||
/**
|
||||
* Initialize the pgvector store with LangChain
|
||||
*/
|
||||
@@ -115,25 +149,35 @@ export class PgVectorService {
|
||||
/**
|
||||
* Generate an embedding for the given text using OpenAI
|
||||
*/
|
||||
async generateEmbedding(
|
||||
text: string,
|
||||
model: string = 'text-embedding-ada-002',
|
||||
): Promise<number[]> {
|
||||
async generateEmbedding(text: string, model?: string): Promise<number[]> {
|
||||
try {
|
||||
// Get model from parameter, environment variable, or use default
|
||||
const embeddingModel =
|
||||
model || process.env.OPENAI_API_MODEL || 'text-embedding-ada-002';
|
||||
|
||||
this.logger.log(
|
||||
`Generating embedding for text: ${text.substring(0, 100)}...`,
|
||||
`Generating embedding for text: ${text.substring(0, 100)}... using model: ${embeddingModel}`,
|
||||
);
|
||||
|
||||
// Check if we need to reinitialize embeddings with new model
|
||||
const currentModel = this.getEmbeddingModel();
|
||||
if (model && model !== currentModel) {
|
||||
this.logger.log(
|
||||
`Switching embedding model from ${currentModel} to ${model}`,
|
||||
);
|
||||
await this.reinitializeEmbeddings(model);
|
||||
}
|
||||
|
||||
if (!this.embeddings) {
|
||||
throw new Error(
|
||||
'OpenAI embeddings not initialized. Please check your API configuration.',
|
||||
);
|
||||
}
|
||||
|
||||
// Use OpenAI embeddings
|
||||
// Use OpenAI embeddings with current model
|
||||
const embedding = await this.embeddings.embedQuery(text);
|
||||
this.logger.log(
|
||||
`Generated OpenAI embedding with ${embedding.length} dimensions`,
|
||||
`Generated OpenAI embedding with ${embedding.length} dimensions using model: ${this.getEmbeddingModel()}`,
|
||||
);
|
||||
return embedding;
|
||||
} catch (error) {
|
||||
@@ -191,7 +235,8 @@ export class PgVectorService {
|
||||
`UPDATE icd_codes
|
||||
SET embedding = $1::vector,
|
||||
metadata = $2::jsonb,
|
||||
content = $3
|
||||
content = $3,
|
||||
"updatedAt" = NOW()
|
||||
WHERE id = $4`,
|
||||
[
|
||||
vectorString,
|
||||
@@ -289,7 +334,8 @@ export class PgVectorService {
|
||||
`UPDATE icd_codes
|
||||
SET embedding = $1::vector,
|
||||
metadata = $2::jsonb,
|
||||
content = $3
|
||||
content = $3,
|
||||
"updatedAt" = NOW()
|
||||
WHERE id = $4`,
|
||||
[
|
||||
vectorString,
|
||||
@@ -337,16 +383,23 @@ export class PgVectorService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Vector similarity search menggunakan pgvector
|
||||
* Vector similarity search using pgvector with a configurable threshold
|
||||
*/
|
||||
async vectorSearch(
|
||||
query: string,
|
||||
limit: number = 10,
|
||||
category?: string,
|
||||
threshold: number = 0.7,
|
||||
threshold?: number,
|
||||
): Promise<VectorSearchResult[]> {
|
||||
// Get threshold from environment variable or use default
|
||||
const defaultThreshold = parseFloat(
|
||||
process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85',
|
||||
);
|
||||
const similarityThreshold = threshold || defaultThreshold;
|
||||
try {
|
||||
this.logger.log(`Performing pgvector search for: ${query}`);
|
||||
this.logger.log(
|
||||
`Performing pgvector search for: ${query} with threshold: ${similarityThreshold}`,
|
||||
);
|
||||
|
||||
if (!this.embeddings) {
|
||||
throw new Error('OpenAI embeddings not initialized');
|
||||
@@ -358,17 +411,19 @@ export class PgVectorService {
|
||||
// Convert embedding array to proper vector format for pgvector
|
||||
const vectorString = `[${queryEmbedding.join(',')}]`;
|
||||
|
||||
// Build SQL query for vector similarity search
|
||||
// Build SQL query for vector similarity search with higher precision
|
||||
// Using cosine distance and converting to similarity score
|
||||
let sql = `
|
||||
SELECT
|
||||
id, code, display, version, category,
|
||||
1 - (embedding <=> $1::vector) as similarity
|
||||
(1 - (embedding <=> $1::vector)) as similarity
|
||||
FROM icd_codes
|
||||
WHERE embedding IS NOT NULL
|
||||
AND (1 - (embedding <=> $1::vector)) >= $2
|
||||
`;
|
||||
|
||||
const params: any[] = [vectorString];
|
||||
let paramIndex = 2;
|
||||
const params: any[] = [vectorString, similarityThreshold];
|
||||
let paramIndex = 3;
|
||||
|
||||
if (category) {
|
||||
sql += ` AND category = $${paramIndex}`;
|
||||
@@ -376,23 +431,24 @@ export class PgVectorService {
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
sql += ` ORDER BY embedding <=> $1::vector ASC LIMIT $${paramIndex}`;
|
||||
// Order by similarity descending and limit results
|
||||
sql += ` ORDER BY similarity DESC LIMIT $${paramIndex}`;
|
||||
params.push(limit);
|
||||
|
||||
// Execute raw SQL query
|
||||
const result = await this.pool.query(sql, params);
|
||||
|
||||
// Transform and filter results
|
||||
const filteredResults: VectorSearchResult[] = result.rows
|
||||
.filter((row: any) => row.similarity >= threshold)
|
||||
.map((row: any) => ({
|
||||
// Transform results (no need to filter again since SQL already filters)
|
||||
const filteredResults: VectorSearchResult[] = result.rows.map(
|
||||
(row: any) => ({
|
||||
id: row.id,
|
||||
code: row.code,
|
||||
display: row.display,
|
||||
version: row.version,
|
||||
category: row.category,
|
||||
similarity: parseFloat(row.similarity),
|
||||
}));
|
||||
}),
|
||||
);
|
||||
|
||||
this.logger.log(
|
||||
`Pgvector search returned ${filteredResults.length} results for query: "${query}"`,
|
||||
@@ -417,12 +473,12 @@ export class PgVectorService {
|
||||
try {
|
||||
this.logger.log(`Performing hybrid search for: ${query}`);
|
||||
|
||||
// Get vector search results
|
||||
// Get vector search results with higher threshold
|
||||
const vectorResults = await this.vectorSearch(
|
||||
query,
|
||||
limit * 2,
|
||||
category,
|
||||
0.5,
|
||||
parseFloat(process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85'),
|
||||
);
|
||||
|
||||
// Get text search results
|
||||
@@ -475,7 +531,7 @@ export class PgVectorService {
|
||||
try {
|
||||
let sql = 'SELECT id, code, display, version, category FROM icd_codes';
|
||||
const params: any[] = [];
|
||||
let whereConditions: string[] = [];
|
||||
const whereConditions: string[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (category) {
|
||||
@@ -515,6 +571,51 @@ export class PgVectorService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Get the similarity threshold currently in effect.
 *
 * The value is read from VECTOR_SIMILARITY_THRESHOLD. When the variable is
 * unset, empty, not a number, or outside the 0..1 range, the default of 0.85
 * is returned so that a malformed setting can never reach the SQL filter as
 * NaN or an impossible bound.
 *
 * @returns a finite threshold between 0 and 1 inclusive
 */
getSimilarityThreshold(): number {
  const fallback = 0.85;

  const raw = process.env.VECTOR_SIMILARITY_THRESHOLD;
  if (!raw) {
    return fallback;
  }

  const parsed = parseFloat(raw);
  const isUsable = Number.isFinite(parsed) && parsed >= 0 && parsed <= 1;
  return isUsable ? parsed : fallback;
}
|
||||
|
||||
/**
 * Get the name of the embedding model currently configured.
 *
 * @returns OPENAI_API_MODEL when it is set to a non-empty value, otherwise
 *          'text-embedding-ada-002'
 */
getEmbeddingModel(): string {
  const configured = process.env.OPENAI_API_MODEL;
  if (configured) {
    return configured;
  }
  return 'text-embedding-ada-002';
}
|
||||
|
||||
/**
 * Set the similarity threshold at runtime.
 *
 * The value is stored in VECTOR_SIMILARITY_THRESHOLD, which is where
 * getSimilarityThreshold() reads it from.
 *
 * @param threshold new threshold; must be a finite number between 0 and 1
 * @throws Error when the value is not a finite number in the 0..1 range
 */
setSimilarityThreshold(threshold: number): void {
  // NaN compares false against both bounds and undefined/null would crash on
  // toString(), so the type and finiteness are checked explicitly first.
  const isValid =
    typeof threshold === 'number' &&
    Number.isFinite(threshold) &&
    threshold >= 0 &&
    threshold <= 1;

  if (!isValid) {
    throw new Error('Similarity threshold must be between 0 and 1');
  }

  process.env.VECTOR_SIMILARITY_THRESHOLD = threshold.toString();
  this.logger.log(`Similarity threshold updated to: ${threshold}`);
}
|
||||
|
||||
/**
 * Switch the embedding model at runtime.
 *
 * Does nothing (beyond logging) when the requested model is already the
 * active one; otherwise the embeddings client is rebuilt for the new model.
 *
 * NOTE(review): rows already stored in icd_codes keep the embeddings they
 * were generated with — confirm whether they must be regenerated after a
 * model switch.
 *
 * @param modelName name of the OpenAI embedding model to activate
 * @throws Error when `modelName` is empty or not a string, or when the
 *         embeddings client cannot be reinitialized
 */
async setEmbeddingModel(modelName: string): Promise<void> {
  const isUsableName = Boolean(modelName) && typeof modelName === 'string';
  if (!isUsableName) {
    throw new Error('Model name must be a valid string');
  }

  const activeModel = this.getEmbeddingModel();
  if (activeModel === modelName) {
    this.logger.log(`Model ${modelName} is already active`);
    return;
  }

  this.logger.log(
    `Switching embedding model from ${activeModel} to ${modelName}`,
  );
  await this.reinitializeEmbeddings(modelName);
}
|
||||
|
||||
/**
|
||||
* Get embedding statistics
|
||||
*/
|
||||
@@ -524,6 +625,7 @@ export class PgVectorService {
|
||||
withoutEmbeddings: number;
|
||||
percentage: number;
|
||||
vectorStoreStatus: string;
|
||||
currentThreshold: number;
|
||||
}> {
|
||||
try {
|
||||
// Use raw SQL to get embedding statistics
|
||||
@@ -548,6 +650,7 @@ export class PgVectorService {
|
||||
withoutEmbeddings,
|
||||
percentage: Math.round(percentage * 100) / 100,
|
||||
vectorStoreStatus,
|
||||
currentThreshold: this.getSimilarityThreshold(),
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error('Error getting embedding stats:', error);
|
||||
@@ -569,6 +672,95 @@ export class PgVectorService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Advanced vector search that blends two similarity metrics.
 *
 * Rows are filtered in SQL by cosine similarity (1 - cosine distance) against
 * the threshold, ordered by cosine then Euclidean similarity, and limited.
 * The `similarity` reported for each row is the weighted blend
 * 0.7 * cosine + 0.3 * (1 - Euclidean distance), rounded to 3 decimals, so it
 * may differ from the cosine value that was compared against the threshold.
 *
 * @param query     free-text query to embed and search for
 * @param limit     maximum number of rows to return (default 10)
 * @param category  optional category to restrict the search to
 * @param threshold optional minimum cosine similarity; when omitted the
 *                  configured threshold is used. An explicit 0 is honoured.
 * @returns matching ICD codes with their blended similarity score
 * @throws Error when the embeddings client is not initialized; any error from
 *         embedding generation or the database query is logged and rethrown
 */
async advancedVectorSearch(
  query: string,
  limit: number = 10,
  category?: string,
  threshold?: number,
): Promise<VectorSearchResult[]> {
  try {
    // `??` rather than `||`: a caller passing 0 explicitly asks for "no
    // filtering" and must not be silently replaced by the default.
    const similarityThreshold = threshold ?? this.getSimilarityThreshold();

    this.logger.log(
      `Performing advanced vector search for: ${query} with threshold: ${similarityThreshold}`,
    );

    if (!this.embeddings) {
      throw new Error('OpenAI embeddings not initialized');
    }

    // Generate embedding for query and format it as a pgvector literal
    const queryEmbedding = await this.generateEmbedding(query);
    const vectorString = `[${queryEmbedding.join(',')}]`;

    let sql = `
      SELECT
        id, code, display, version, category,
        (1 - (embedding <=> $1::vector)) as cosine_similarity,
        (1 - (embedding <-> $1::vector)) as euclidean_similarity
      FROM icd_codes
      WHERE embedding IS NOT NULL
    `;

    const params: any[] = [vectorString];
    let paramIndex = 2;

    if (category) {
      sql += ` AND category = $${paramIndex}`;
      params.push(category);
      paramIndex++;
    }

    // Filter by cosine similarity threshold
    sql += ` AND (1 - (embedding <=> $1::vector)) >= $${paramIndex}`;
    params.push(similarityThreshold);
    paramIndex++;

    // Best cosine match first; Euclidean similarity breaks ties
    sql += ` ORDER BY cosine_similarity DESC, euclidean_similarity DESC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(sql, params);

    const filteredResults: VectorSearchResult[] = result.rows.map(
      (row: any) => {
        const cosineSim = parseFloat(row.cosine_similarity);
        const euclideanSim = parseFloat(row.euclidean_similarity);

        // Weighted average of the two metrics
        const combinedSimilarity = cosineSim * 0.7 + euclideanSim * 0.3;

        return {
          id: row.id,
          code: row.code,
          display: row.display,
          version: row.version,
          category: row.category,
          similarity: Math.round(combinedSimilarity * 1000) / 1000, // 3 decimal places
        };
      },
    );

    this.logger.log(
      `Advanced vector search returned ${filteredResults.length} results for query: "${query}" with threshold: ${similarityThreshold}`,
    );
    return filteredResults;
  } catch (error) {
    this.logger.error('Error in advanced vector search:', error);
    throw error;
  }
}
|
||||
|
||||
/**
|
||||
* Get vector store status
|
||||
*/
|
||||
@@ -589,9 +781,10 @@ export class PgVectorService {
|
||||
initialized: !!this.vectorStore,
|
||||
documentCount,
|
||||
embeddingModel: this.embeddings
|
||||
? `OpenAI ${process.env.OPENAI_API_MODEL || 'text-embedding-ada-002'}`
|
||||
? `OpenAI ${this.getEmbeddingModel()}`
|
||||
: 'Not Available',
|
||||
lastUpdated: new Date(),
|
||||
currentThreshold: this.getSimilarityThreshold(),
|
||||
};
|
||||
|
||||
return status;
|
||||
@@ -601,6 +794,164 @@ export class PgVectorService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Build the text that is embedded for an ICD code.
 *
 * Starts from "<code> <display>" and appends, in order: a category phrase
 * (ICD9 / ICD10), the version when present, one context phrase for every
 * keyword group that matches the lower-cased display text, and finally a
 * fixed set of general medical terms. Fragments are joined by single spaces.
 *
 * @param code row with `code`, `display`, `category` and `version` fields
 * @returns the enriched text representation
 */
private createEnhancedTextRepresentation(code: any): string {
  const fragments: string[] = [`${code.code} ${code.display}`];

  // Category context
  switch (code.category) {
    case 'ICD9':
      fragments.push('ICD-9 CM procedure diagnosis');
      break;
    case 'ICD10':
      fragments.push('ICD-10 diagnosis condition');
      break;
    default:
      break;
  }

  // Version context
  if (code.version) {
    fragments.push(`${code.version}`);
  }

  // Keyword groups checked against the display text, paired with the
  // context phrase appended when any keyword of the group is present.
  const contextRules: Array<[string[], string]> = [
    [
      ['procedure', 'surgery', 'operation'],
      'medical procedure surgical intervention',
    ],
    [
      ['diagnosis', 'condition', 'disease'],
      'medical diagnosis clinical condition',
    ],
    [['cranial', 'brain', 'head'], 'neurological cranial brain head'],
    [['cardiac', 'heart', 'cardiovascular'], 'cardiac heart cardiovascular'],
    [['pulmonary', 'lung', 'respiratory'], 'pulmonary lung respiratory'],
  ];

  const display = code.display.toLowerCase();
  for (const [keywords, context] of contextRules) {
    if (keywords.some((keyword) => display.includes(keyword))) {
      fragments.push(context);
    }
  }

  // Common medical terms, always appended
  fragments.push('medical healthcare clinical');

  return fragments.join(' ');
}
|
||||
|
||||
/**
 * Regenerate embeddings from the enhanced text representation.
 *
 * Selects up to `limit` ICD codes that already have an embedding, oldest
 * "updatedAt" first. Each processed row gets "updatedAt" = NOW(), so repeated
 * calls advance through the table instead of picking an arbitrary batch that
 * may be the same one again. For every row the enhanced text is built, a new
 * embedding is generated, and the row's embedding and content are updated.
 * A failure on one row is logged and counted; processing continues.
 *
 * @param limit maximum number of codes to process in this call (default 100)
 * @returns counts of rows processed, rows that failed, and rows selected
 * @throws rethrows any error from the initial selection query
 */
async regenerateEmbeddingsWithEnhancedText(limit: number = 100): Promise<{
  processed: number;
  errors: number;
  totalSample: number;
}> {
  try {
    this.logger.log(
      `Starting enhanced embedding regeneration for ${limit} ICD codes...`,
    );

    // Deterministic batch: least recently updated rows first, id as tiebreak
    const codesWithEmbeddings = await this.pool.query(
      'SELECT id, code, display, version, category FROM icd_codes WHERE embedding IS NOT NULL ORDER BY "updatedAt" ASC NULLS FIRST, id ASC LIMIT $1',
      [limit],
    );

    const total = codesWithEmbeddings.rows.length;
    if (total === 0) {
      this.logger.log('No ICD codes found with embeddings to regenerate');
      return { processed: 0, errors: 0, totalSample: 0 };
    }

    this.logger.log(
      `Found ${total} codes to regenerate with enhanced text`,
    );

    let processed = 0;
    let errors = 0;

    // Rows are handled one at a time; each embedding call is awaited before
    // the next row starts.
    for (const code of codesWithEmbeddings.rows) {
      try {
        const text = this.createEnhancedTextRepresentation(code);
        const embedding = await this.generateEmbedding(text);

        // pgvector literal format: [v1,v2,...]
        const vectorString = `[${embedding.join(',')}]`;

        await this.pool.query(
          `UPDATE icd_codes
           SET embedding = $1::vector,
               content = $2,
               "updatedAt" = NOW()
           WHERE id = $3`,
          [vectorString, text, code.id],
        );

        processed++;

        if (processed % 10 === 0) {
          this.logger.log(
            `Regenerated ${processed}/${total} enhanced embeddings`,
          );
        }
      } catch (error) {
        this.logger.error(
          `Error regenerating embedding for code ${code.code}:`,
          error,
        );
        errors++;
      }
    }

    this.logger.log(
      `Enhanced embedding regeneration completed. Processed: ${processed}, Errors: ${errors}, Total: ${total}`,
    );

    return {
      processed,
      errors,
      totalSample: total,
    };
  } catch (error) {
    this.logger.error(
      'Error in regenerateEmbeddingsWithEnhancedText:',
      error,
    );
    throw error;
  }
}
|
||||
|
||||
/**
|
||||
* Cleanup resources
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user