Try to fix similarity threshold and add seed for master ICD Excel data
This commit is contained in:
@@ -176,4 +176,6 @@ METRICS_PATH=/metrics
|
|||||||
HEALTH_CHECK_PATH=/health
|
HEALTH_CHECK_PATH=/health
|
||||||
|
|
||||||
OPENAI_API_KEY=xxxxxx
|
OPENAI_API_KEY=xxxxxx
|
||||||
OPENAI_API_MODEL=text-embedding-ada-002
|
OPENAI_API_MODEL=text-embedding-ada-002
|
||||||
|
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||||
@@ -1,7 +1,8 @@
|
|||||||
|
version: '3.8'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
# PostgreSQL Database with pgvector extension
|
|
||||||
postgres:
|
postgres:
|
||||||
image: pgvector/pgvector:pg15
|
image: pgvector/pgvector:pg17
|
||||||
container_name: claim-guard-postgres
|
container_name: claim-guard-postgres
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
@@ -9,7 +10,7 @@ services:
|
|||||||
POSTGRES_USER: postgres
|
POSTGRES_USER: postgres
|
||||||
POSTGRES_PASSWORD: postgres123
|
POSTGRES_PASSWORD: postgres123
|
||||||
ports:
|
ports:
|
||||||
- '5432:5432'
|
- '5433:5432' # host:container → access from the host via port 5433
|
||||||
volumes:
|
volumes:
|
||||||
- postgres_data:/var/lib/postgresql/data
|
- postgres_data:/var/lib/postgresql/data
|
||||||
- ./docker/postgres/init:/docker-entrypoint-initdb.d
|
- ./docker/postgres/init:/docker-entrypoint-initdb.d
|
||||||
|
|||||||
73
docs/ENVIRONMENT_VARIABLES.md
Normal file
73
docs/ENVIRONMENT_VARIABLES.md
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# Environment Variables
|
||||||
|
|
||||||
|
## Database Configuration
|
||||||
|
- `DATABASE_URL`: PostgreSQL connection string
|
||||||
|
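- Example: `postgresql://username:password@localhost:5432/claim_guard_db` (use port `5433` instead when connecting from the host to the Postgres container defined in this repository's Docker Compose file, which publishes `5433:5432`)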
|
||||||
|
|
||||||
|
## OpenAI Configuration
|
||||||
|
- `OPENAI_API_KEY`: Your OpenAI API key for embeddings
|
||||||
|
- `OPENAI_API_MODEL`: OpenAI model for embeddings (default: `text-embedding-ada-002`)
|
||||||
|
|
||||||
|
## Vector Search Configuration
|
||||||
|
- `VECTOR_SIMILARITY_THRESHOLD`: Minimum similarity threshold for vector search (default: `0.85`)
|
||||||
|
- Range: 0.0 to 1.0
|
||||||
|
- Higher values = more strict matching
|
||||||
|
- Recommended: 0.85 for production, 0.7 for development
|
||||||
|
|
||||||
|
## Application Configuration
|
||||||
|
- `PORT`: Application port (default: 3000)
|
||||||
|
- `NODE_ENV`: Environment mode (development/production)
|
||||||
|
|
||||||
|
## Example .env file
|
||||||
|
```bash
|
||||||
|
# Database
|
||||||
|
DATABASE_URL="postgresql://username:password@localhost:5432/claim_guard_db"
|
||||||
|
|
||||||
|
# OpenAI
|
||||||
|
OPENAI_API_KEY="your-openai-api-key-here"
|
||||||
|
OPENAI_API_MODEL="text-embedding-ada-002"
|
||||||
|
|
||||||
|
# Vector Search
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||||
|
|
||||||
|
# App
|
||||||
|
PORT=3000
|
||||||
|
NODE_ENV=development
|
||||||
|
```
|
||||||
|
|
||||||
|
## Similarity Threshold Guidelines
|
||||||
|
|
||||||
|
### Production Environment
|
||||||
|
- **High Precision**: 0.90 - 0.95 (very strict matching)
|
||||||
|
- **Standard**: 0.85 - 0.90 (recommended for most use cases)
|
||||||
|
- **Balanced**: 0.80 - 0.85 (good balance between precision and recall)
|
||||||
|
|
||||||
|
### Development Environment
|
||||||
|
- **Testing**: 0.70 - 0.80 (more lenient for testing)
|
||||||
|
- **Debugging**: 0.60 - 0.70 (very lenient for development)
|
||||||
|
|
||||||
|
### How to Set Threshold
|
||||||
|
|
||||||
|
#### Via Environment Variable
|
||||||
|
```bash
|
||||||
|
export VECTOR_SIMILARITY_THRESHOLD=0.90
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Via .env file
|
||||||
|
```bash
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.90
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Via API (Runtime)
|
||||||
|
```http
|
||||||
|
POST /api/pgvector/threshold
|
||||||
|
{
|
||||||
|
"threshold": 0.90
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Impact of Threshold Changes
|
||||||
|
|
||||||
|
- **Higher Threshold (0.90+)**: Fewer results, higher precision, more relevant matches
|
||||||
|
- **Lower Threshold (0.70-)**: More results, lower precision, may include less relevant matches
|
||||||
|
- **Optimal Range (0.80-0.90)**: Good balance between precision and recall for most medical coding use cases
|
||||||
243
docs/SIMILARITY_THRESHOLD.md
Normal file
243
docs/SIMILARITY_THRESHOLD.md
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
# Similarity Threshold Configuration
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The similarity threshold feature allows you to control the precision of vector search results by setting a minimum similarity score required for results to be returned. This ensures that only highly relevant matches are included in search results.
|
||||||
|
|
||||||
|
## Default Configuration
|
||||||
|
|
||||||
|
- **Default Threshold**: `0.85` (85% similarity)
|
||||||
|
- **Environment Variable**: `VECTOR_SIMILARITY_THRESHOLD`
|
||||||
|
- **Range**: 0.0 to 1.0 (0% to 100% similarity)
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### 1. Get Current Threshold
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /api/pgvector/threshold
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"threshold": 0.85,
|
||||||
|
"description": "Minimum similarity score required for search results (0.0 - 1.0)"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Set Threshold
|
||||||
|
|
||||||
|
```http
|
||||||
|
POST /api/pgvector/threshold
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"threshold": 0.90
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"message": "Similarity threshold updated successfully",
|
||||||
|
"threshold": 0.9,
|
||||||
|
"previousThreshold": 0.85
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Advanced Vector Search
|
||||||
|
|
||||||
|
```http
|
||||||
|
POST /api/pgvector/advanced-search
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"query": "diabetes mellitus type 2",
|
||||||
|
"limit": 10,
|
||||||
|
"category": "ICD10",
|
||||||
|
"threshold": 0.90
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Search Methods
|
||||||
|
|
||||||
|
### Standard Vector Search
|
||||||
|
|
||||||
|
- Uses cosine similarity
|
||||||
|
- Default threshold from environment variable
|
||||||
|
- Good for general use cases
|
||||||
|
|
||||||
|
### Advanced Vector Search
|
||||||
|
|
||||||
|
- Combines cosine and euclidean similarity metrics
|
||||||
|
- Weighted scoring: 70% cosine + 30% euclidean
|
||||||
|
- Higher precision results
|
||||||
|
- Recommended for production use
|
||||||
|
|
||||||
|
### Hybrid Search
|
||||||
|
|
||||||
|
- Combines vector similarity with text search
|
||||||
|
- Uses threshold from environment variable
|
||||||
|
- Best balance of semantic and text matching
|
||||||
|
|
||||||
|
## Threshold Recommendations
|
||||||
|
|
||||||
|
### Medical Coding Use Cases
|
||||||
|
|
||||||
|
| Use Case | Recommended Threshold | Description |
|
||||||
|
| ---------------------------- | --------------------- | --------------------------------------------- |
|
||||||
|
| **High Precision Diagnosis** | 0.90 - 0.95 | Very strict matching for critical diagnoses |
|
||||||
|
| **Standard Medical Coding** | 0.85 - 0.90 | Recommended for most medical coding scenarios |
|
||||||
|
| **General Medical Search** | 0.80 - 0.85 | Good balance between precision and recall |
|
||||||
|
| **Research & Exploration** | 0.70 - 0.80 | More lenient for research purposes |
|
||||||
|
|
||||||
|
### Environment-Specific Settings
|
||||||
|
|
||||||
|
#### Production Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Development Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.70
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Testing Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.75
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
### Environment Variable
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set in .env file
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||||
|
|
||||||
|
# Or set as system environment variable
|
||||||
|
export VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||||
|
```
|
||||||
|
|
||||||
|
### Runtime Configuration
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// Get current threshold
|
||||||
|
const currentThreshold = pgVectorService.getSimilarityThreshold();
|
||||||
|
|
||||||
|
// Set new threshold
|
||||||
|
pgVectorService.setSimilarityThreshold(0.9);
|
||||||
|
```
|
||||||
|
|
||||||
|
### SQL Query Optimization
|
||||||
|
|
||||||
|
The system automatically optimizes SQL queries to:
|
||||||
|
|
||||||
|
- Filter results at database level using threshold
|
||||||
|
- Order results by similarity score
|
||||||
|
- Use appropriate vector similarity operators
|
||||||
|
|
||||||
|
## Performance Impact
|
||||||
|
|
||||||
|
### Higher Threshold (0.90+)
|
||||||
|
|
||||||
|
- ✅ Fewer results to process
|
||||||
|
- ✅ Higher precision
|
||||||
|
- ❌ May miss relevant results
|
||||||
|
- ❌ Slower query execution (more filtering)
|
||||||
|
|
||||||
|
### Lower Threshold (0.70-)
|
||||||
|
|
||||||
|
- ✅ Faster query execution
|
||||||
|
- ✅ More comprehensive results
|
||||||
|
- ❌ Lower precision
|
||||||
|
- ❌ More irrelevant results
|
||||||
|
|
||||||
|
### Optimal Range (0.80-0.90)
|
||||||
|
|
||||||
|
- ✅ Good balance of precision and performance
|
||||||
|
- ✅ Suitable for most medical coding scenarios
|
||||||
|
- ✅ Reasonable query execution time
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Common Issues
|
||||||
|
|
||||||
|
1. **No Results Returned**
|
||||||
|
- Check if threshold is too high
|
||||||
|
- Verify embeddings are generated
|
||||||
|
- Check database connection
|
||||||
|
|
||||||
|
2. **Too Many Results**
|
||||||
|
- Increase threshold value
|
||||||
|
- Use advanced search method
|
||||||
|
- Add category filters
|
||||||
|
|
||||||
|
3. **Performance Issues**
|
||||||
|
- Optimize threshold for your use case
|
||||||
|
- Use database indexes
|
||||||
|
- Consider batch processing
|
||||||
|
|
||||||
|
### Debug Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check current threshold
|
||||||
|
curl -X GET http://localhost:3000/api/pgvector/threshold
|
||||||
|
|
||||||
|
# Get embedding statistics
|
||||||
|
curl -X GET http://localhost:3000/api/pgvector/stats
|
||||||
|
|
||||||
|
# Test with different thresholds
|
||||||
|
curl -X POST http://localhost:3000/api/pgvector/advanced-search \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"query": "test", "threshold": 0.80}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Start with Default**: Begin with threshold 0.85
|
||||||
|
2. **Test Incrementally**: Adjust threshold in small increments (0.05)
|
||||||
|
3. **Monitor Results**: Evaluate precision vs. recall trade-offs
|
||||||
|
4. **Environment Specific**: Use different thresholds for different environments
|
||||||
|
5. **Document Changes**: Keep track of threshold changes and their impact
|
||||||
|
|
||||||
|
## Migration Guide
|
||||||
|
|
||||||
|
### From Previous Version
|
||||||
|
|
||||||
|
If upgrading from a version without configurable threshold:
|
||||||
|
|
||||||
|
1. **Set Environment Variable**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VECTOR_SIMILARITY_THRESHOLD=0.85
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Update Search Calls**:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// Old way (hardcoded 0.7)
|
||||||
|
const results = await service.vectorSearch(query, limit, category, 0.7);
|
||||||
|
|
||||||
|
// New way (uses environment variable)
|
||||||
|
const results = await service.vectorSearch(query, limit, category);
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Test New Thresholds**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test with current threshold
|
||||||
|
curl -X GET http://localhost:3000/api/pgvector/threshold
|
||||||
|
|
||||||
|
# Adjust if needed
|
||||||
|
curl -X POST http://localhost:3000/api/pgvector/threshold \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"threshold": 0.90}'
|
||||||
|
```
|
||||||
@@ -7,7 +7,7 @@
|
|||||||
"license": "UNLICENSED",
|
"license": "UNLICENSED",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "nest build",
|
"build": "nest build",
|
||||||
"format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\"",
|
"format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\" \"prisma/seed/**/*.ts\"",
|
||||||
"start": "nest start",
|
"start": "nest start",
|
||||||
"start:dev": "nest start --watch",
|
"start:dev": "nest start --watch",
|
||||||
"start:debug": "nest start --debug --watch",
|
"start:debug": "nest start --debug --watch",
|
||||||
@@ -17,7 +17,9 @@
|
|||||||
"test:watch": "jest --watch",
|
"test:watch": "jest --watch",
|
||||||
"test:cov": "jest --coverage",
|
"test:cov": "jest --coverage",
|
||||||
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
||||||
"test:e2e": "jest --config ./test/jest-e2e.json"
|
"test:e2e": "jest --config ./test/jest-e2e.json",
|
||||||
|
"seed": "ts-node -r tsconfig-paths/register prisma/seed/seed.ts",
|
||||||
|
"seed:icd": "ts-node -r tsconfig-paths/register prisma/seed/icd/icd.seed.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@langchain/community": "^0.3.53",
|
"@langchain/community": "^0.3.53",
|
||||||
|
|||||||
170
prisma/seed/icd/icd.seed.ts
Normal file
170
prisma/seed/icd/icd.seed.ts
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
import { PrismaClient } from '@prisma/client';
|
||||||
|
import * as XLSX from 'xlsx';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
|
||||||
|
interface IcdData {
|
||||||
|
code: string;
|
||||||
|
display: string;
|
||||||
|
version: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Seeds the `icdCode` table from the ICD-9CM and ICD-10 e-klaim Excel workbooks
 * stored under `prisma/seed/icd/`.
 *
 * Both workbooks are parsed before any existing row is deleted, so a missing or
 * unreadable file aborts the run without touching the table.
 */
export class IcdSeeder {
  private readonly prisma = new PrismaClient();

  /**
   * Replaces every ICD code in the database with the contents of the two workbooks.
   *
   * NOTE: the delete and the subsequent inserts are not wrapped in a transaction;
   * a failure while inserting leaves the table partially populated.
   *
   * @returns The number of rows actually inserted for ICD-9, ICD-10 and in total.
   * @throws Re-throws any file, parsing or database error after logging it.
   */
  async seed(): Promise<{
    icd9Count: number;
    icd10Count: number;
    total: number;
  }> {
    try {
      console.log('Starting ICD data import...');

      // Parse both workbooks up front so that file problems surface before the delete.
      const icd9Data = this.readExcelFile(
        'prisma/seed/icd/[PUBLIC] ICD-9CM e-klaim.xlsx',
        'ICD9',
      );
      const icd10Data = this.readExcelFile(
        'prisma/seed/icd/[PUBLIC] ICD-10 e-klaim.xlsx',
        'ICD10',
      );

      // Clear existing data
      await this.prisma.icdCode.deleteMany({});
      console.log('Cleared existing ICD data');

      const icd9Count = await this.bulkInsertData(icd9Data, 'ICD9');
      console.log(`Imported ${icd9Count} ICD-9 codes`);

      const icd10Count = await this.bulkInsertData(icd10Data, 'ICD10');
      console.log(`Imported ${icd10Count} ICD-10 codes`);

      const total = icd9Count + icd10Count;
      console.log(`Total imported: ${total} ICD codes`);

      return { icd9Count, icd10Count, total };
    } catch (error) {
      console.error('Error importing ICD data:', error);
      throw error;
    }
  }

  /**
   * Reads the first sheet of an Excel workbook and converts it to ICD records.
   *
   * The first row is skipped as a header. For every other row, columns 0, 1 and 2
   * are taken as code, display text and version; rows with fewer than three cells,
   * or where any of the three values is blank after trimming, are dropped.
   *
   * @param filePath Workbook path relative to the current working directory.
   * @param category Label used only in log messages (e.g. `ICD9`).
   * @returns The valid records found in the sheet.
   * @throws Error when the file does not exist; parsing errors are re-thrown.
   */
  private readExcelFile(filePath: string, category: string): IcdData[] {
    try {
      const fullPath = path.join(process.cwd(), filePath);

      if (!fs.existsSync(fullPath)) {
        throw new Error(`File not found: ${fullPath}`);
      }

      console.log(`Reading ${category} file: ${filePath}`);

      const workbook = XLSX.readFile(fullPath);
      const worksheet = workbook.Sheets[workbook.SheetNames[0]];

      // `header: 1` yields one array of cell values per sheet row.
      const rows = XLSX.utils.sheet_to_json(worksheet, {
        header: 1,
      }) as unknown[][];

      const icdData: IcdData[] = [];

      // Skip the header row and keep only complete records.
      for (const row of rows.slice(1)) {
        if (!row || row.length < 3) {
          continue;
        }

        const code = this.cleanString(row[0]);
        const display = this.cleanString(row[1]);
        const version = this.cleanString(row[2]);

        if (code && display && version) {
          icdData.push({ code, display, version });
        }
      }

      console.log(`Found ${icdData.length} valid ${category} records`);
      return icdData;
    } catch (error) {
      console.error(`Error reading ${category} file:`, error);
      throw error;
    }
  }

  /**
   * Inserts the records in batches of 1000 via `createMany`.
   *
   * Because `skipDuplicates` is enabled, the database may insert fewer rows than
   * were submitted; the total is therefore accumulated from the `count` reported
   * by each `createMany` call rather than from the batch size.
   *
   * @param data Records to insert.
   * @param category Category value stored on every inserted row.
   * @returns The number of rows actually inserted.
   * @throws Re-throws any database error after logging it.
   */
  private async bulkInsertData(
    data: IcdData[],
    category: string,
  ): Promise<number> {
    try {
      const batchSize = 1000;
      let totalInserted = 0;

      for (let i = 0; i < data.length; i += batchSize) {
        const batch = data.slice(i, i + batchSize);

        const { count } = await this.prisma.icdCode.createMany({
          data: batch.map(({ code, display, version }) => ({
            code,
            display,
            version,
            category,
          })),
          skipDuplicates: true,
        });

        totalInserted += count;

        const skipped = batch.length - count;
        const skippedNote =
          skipped > 0 ? ` (${skipped} duplicates skipped)` : '';
        console.log(
          `Inserted batch ${Math.floor(i / batchSize) + 1} for ${category}: ${count} records${skippedNote}`,
        );
      }

      return totalInserted;
    } catch (error) {
      console.error(`Error inserting ${category} data:`, error);
      throw error;
    }
  }

  /**
   * Converts a cell value to a trimmed string; `null` and `undefined` become `''`.
   */
  private cleanString(value: unknown): string {
    return String(value ?? '').trim();
  }

  /** Closes the Prisma connection owned by this seeder. */
  async disconnect() {
    await this.prisma.$disconnect();
  }
}
|
||||||
|
|
||||||
|
// Standalone execution
|
||||||
|
// Runs the ICD seeder only when this file is executed directly (not when imported).
if (require.main === module) {
  const seeder = new IcdSeeder();

  void (async () => {
    try {
      const result = await seeder.seed();
      console.log('ICD seeding completed successfully:', result);
    } catch (error) {
      console.error('ICD seeding failed:', error);
      // Set the exit code instead of calling process.exit(): exiting here would
      // terminate the process before the disconnect below gets a chance to run.
      process.exitCode = 1;
    } finally {
      await seeder.disconnect();
    }
  })();
}
|
||||||
27
prisma/seed/seed.ts
Normal file
27
prisma/seed/seed.ts
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
import { PrismaClient } from '@prisma/client';
|
||||||
|
import { IcdSeeder } from './icd/icd.seed';
|
||||||
|
|
||||||
|
const prisma = new PrismaClient();
|
||||||
|
|
||||||
|
/**
 * Runs every database seeder (currently only the ICD seeder).
 *
 * The seeder's connection is closed in `finally`, so it is released even when
 * seeding fails.
 *
 * @throws Re-throws any seeding error after logging it.
 */
async function main() {
  console.log('Starting database seeding...');

  const icdSeeder = new IcdSeeder();

  try {
    // Seed ICD data
    const icdResult = await icdSeeder.seed();
    console.log('ICD seeding completed:', icdResult);

    console.log('Database seeding completed successfully!');
  } catch (error) {
    console.error('Error during seeding:', error);
    throw error;
  } finally {
    await icdSeeder.disconnect();
  }
}
|
||||||
|
|
||||||
|
main()
  .catch((error) => {
    console.error(error);
    // Report the failure to the caller (CI, `npm run seed`) with a non-zero exit
    // status; without this the script would exit 0 even though seeding failed.
    process.exitCode = 1;
  })
  .finally(() => {
    void prisma.$disconnect();
  });
|
||||||
@@ -64,7 +64,7 @@ export class HealthController {
|
|||||||
status: 200,
|
status: 200,
|
||||||
description: 'Application is ready',
|
description: 'Application is ready',
|
||||||
})
|
})
|
||||||
async getReady() {
|
getReady() {
|
||||||
return { status: 'ready' };
|
return { status: 'ready' };
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,7 +77,7 @@ export class HealthController {
|
|||||||
status: 200,
|
status: 200,
|
||||||
description: 'Application is alive',
|
description: 'Application is alive',
|
||||||
})
|
})
|
||||||
async getLive() {
|
getLive() {
|
||||||
return { status: 'alive' };
|
return { status: 'alive' };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { Controller, Get, Post, Query, Logger } from '@nestjs/common';
|
import { Controller, Get, Query, Logger } from '@nestjs/common';
|
||||||
import {
|
import {
|
||||||
ApiTags,
|
ApiTags,
|
||||||
ApiOperation,
|
ApiOperation,
|
||||||
@@ -8,10 +8,8 @@ import {
|
|||||||
ApiInternalServerErrorResponse,
|
ApiInternalServerErrorResponse,
|
||||||
} from '@nestjs/swagger';
|
} from '@nestjs/swagger';
|
||||||
import { IcdService } from './icd.service';
|
import { IcdService } from './icd.service';
|
||||||
import { SearchIcdDto } from './dto/search-icd.dto';
|
|
||||||
import {
|
import {
|
||||||
IcdSearchResponseDto,
|
IcdSearchResponseDto,
|
||||||
IcdImportResponseDto,
|
|
||||||
IcdStatisticsResponseDto,
|
IcdStatisticsResponseDto,
|
||||||
ErrorResponseDto,
|
ErrorResponseDto,
|
||||||
} from './dto/icd-response.dto';
|
} from './dto/icd-response.dto';
|
||||||
@@ -23,40 +21,6 @@ export class IcdController {
|
|||||||
|
|
||||||
constructor(private readonly icdService: IcdService) {}
|
constructor(private readonly icdService: IcdService) {}
|
||||||
|
|
||||||
@Post('import')
|
|
||||||
@ApiOperation({
|
|
||||||
summary: 'Import ICD data from Excel files',
|
|
||||||
description:
|
|
||||||
'Import ICD-9 and ICD-10 codes from Excel files located in the test directory. This operation will process both ICD files and insert/update the database with the latest codes.',
|
|
||||||
})
|
|
||||||
@ApiResponse({
|
|
||||||
status: 200,
|
|
||||||
description: 'ICD data imported successfully',
|
|
||||||
type: IcdImportResponseDto,
|
|
||||||
})
|
|
||||||
@ApiBadRequestResponse({
|
|
||||||
description: 'Bad request - Invalid file format or missing files',
|
|
||||||
type: ErrorResponseDto,
|
|
||||||
})
|
|
||||||
@ApiInternalServerErrorResponse({
|
|
||||||
description: 'Internal server error during import process',
|
|
||||||
type: ErrorResponseDto,
|
|
||||||
})
|
|
||||||
async importData(): Promise<IcdImportResponseDto> {
|
|
||||||
try {
|
|
||||||
this.logger.log('Starting ICD data import...');
|
|
||||||
const result = await this.icdService.importIcdData();
|
|
||||||
return {
|
|
||||||
success: true,
|
|
||||||
message: 'ICD data imported successfully',
|
|
||||||
data: result,
|
|
||||||
};
|
|
||||||
} catch (error) {
|
|
||||||
this.logger.error('Error importing ICD data:', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Get('search')
|
@Get('search')
|
||||||
@ApiOperation({
|
@ApiOperation({
|
||||||
summary: 'Search ICD codes with filters and pagination',
|
summary: 'Search ICD codes with filters and pagination',
|
||||||
|
|||||||
@@ -1,158 +1,11 @@
|
|||||||
import { Injectable, Logger } from '@nestjs/common';
|
import { Injectable, Logger } from '@nestjs/common';
|
||||||
import { PrismaClient } from '../../generated/prisma';
|
import { PrismaClient } from '@prisma/client';
|
||||||
import * as XLSX from 'xlsx';
|
|
||||||
import * as path from 'path';
|
|
||||||
import * as fs from 'fs';
|
|
||||||
|
|
||||||
interface IcdData {
|
|
||||||
code: string;
|
|
||||||
display: string;
|
|
||||||
version: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class IcdService {
|
export class IcdService {
|
||||||
private readonly logger = new Logger(IcdService.name);
|
private readonly logger = new Logger(IcdService.name);
|
||||||
private readonly prisma = new PrismaClient();
|
private readonly prisma = new PrismaClient();
|
||||||
|
|
||||||
async importIcdData(): Promise<{
|
|
||||||
icd9Count: number;
|
|
||||||
icd10Count: number;
|
|
||||||
total: number;
|
|
||||||
}> {
|
|
||||||
try {
|
|
||||||
this.logger.log('Starting ICD data import...');
|
|
||||||
|
|
||||||
// Import ICD-9 data
|
|
||||||
const icd9Data = await this.readExcelFile(
|
|
||||||
'test/[PUBLIC] ICD-9CM e-klaim.xlsx',
|
|
||||||
'ICD9',
|
|
||||||
);
|
|
||||||
|
|
||||||
// Import ICD-10 data
|
|
||||||
const icd10Data = await this.readExcelFile(
|
|
||||||
'test/[PUBLIC] ICD-10 e-klaim.xlsx',
|
|
||||||
'ICD10',
|
|
||||||
);
|
|
||||||
|
|
||||||
// Clear existing data
|
|
||||||
await this.prisma.icdCode.deleteMany({});
|
|
||||||
this.logger.log('Cleared existing ICD data');
|
|
||||||
|
|
||||||
// Insert ICD-9 data
|
|
||||||
const icd9Count = await this.bulkInsertData(icd9Data, 'ICD9');
|
|
||||||
this.logger.log(`Imported ${icd9Count} ICD-9 codes`);
|
|
||||||
|
|
||||||
// Insert ICD-10 data
|
|
||||||
const icd10Count = await this.bulkInsertData(icd10Data, 'ICD10');
|
|
||||||
this.logger.log(`Imported ${icd10Count} ICD-10 codes`);
|
|
||||||
|
|
||||||
const total = icd9Count + icd10Count;
|
|
||||||
this.logger.log(`Total imported: ${total} ICD codes`);
|
|
||||||
|
|
||||||
return {
|
|
||||||
icd9Count,
|
|
||||||
icd10Count,
|
|
||||||
total,
|
|
||||||
};
|
|
||||||
} catch (error) {
|
|
||||||
this.logger.error('Error importing ICD data:', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private async readExcelFile(
|
|
||||||
filePath: string,
|
|
||||||
category: string,
|
|
||||||
): Promise<IcdData[]> {
|
|
||||||
try {
|
|
||||||
const fullPath = path.join(process.cwd(), filePath);
|
|
||||||
|
|
||||||
if (!fs.existsSync(fullPath)) {
|
|
||||||
throw new Error(`File not found: ${fullPath}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.logger.log(`Reading ${category} file: ${filePath}`);
|
|
||||||
|
|
||||||
const workbook = XLSX.readFile(fullPath);
|
|
||||||
const sheetName = workbook.SheetNames[0];
|
|
||||||
const worksheet = workbook.Sheets[sheetName];
|
|
||||||
|
|
||||||
// Convert sheet to JSON
|
|
||||||
const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 });
|
|
||||||
|
|
||||||
// Skip header row and process data
|
|
||||||
const icdData: IcdData[] = [];
|
|
||||||
|
|
||||||
for (let i = 1; i < jsonData.length; i++) {
|
|
||||||
const row = jsonData[i] as any[];
|
|
||||||
|
|
||||||
if (row && row.length >= 3) {
|
|
||||||
const code = this.cleanString(row[0]);
|
|
||||||
const display = this.cleanString(row[1]);
|
|
||||||
const version = this.cleanString(row[2]);
|
|
||||||
|
|
||||||
if (code && display && version) {
|
|
||||||
icdData.push({
|
|
||||||
code,
|
|
||||||
display,
|
|
||||||
version,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this.logger.log(`Found ${icdData.length} valid ${category} records`);
|
|
||||||
return icdData;
|
|
||||||
} catch (error) {
|
|
||||||
this.logger.error(`Error reading ${category} file:`, error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private async bulkInsertData(
|
|
||||||
data: IcdData[],
|
|
||||||
category: string,
|
|
||||||
): Promise<number> {
|
|
||||||
try {
|
|
||||||
const batchSize = 1000;
|
|
||||||
let totalInserted = 0;
|
|
||||||
|
|
||||||
for (let i = 0; i < data.length; i += batchSize) {
|
|
||||||
const batch = data.slice(i, i + batchSize);
|
|
||||||
|
|
||||||
const insertData = batch.map((item) => ({
|
|
||||||
code: item.code,
|
|
||||||
display: item.display,
|
|
||||||
version: item.version,
|
|
||||||
category,
|
|
||||||
}));
|
|
||||||
|
|
||||||
await this.prisma.icdCode.createMany({
|
|
||||||
data: insertData,
|
|
||||||
skipDuplicates: true,
|
|
||||||
});
|
|
||||||
|
|
||||||
totalInserted += batch.length;
|
|
||||||
this.logger.log(
|
|
||||||
`Inserted batch ${Math.floor(i / batchSize) + 1} for ${category}: ${batch.length} records`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return totalInserted;
|
|
||||||
} catch (error) {
|
|
||||||
this.logger.error(`Error inserting ${category} data:`, error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private cleanString(value: any): string {
|
|
||||||
if (value === null || value === undefined) {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
return String(value).trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
async findIcdCodes(
|
async findIcdCodes(
|
||||||
category?: string,
|
category?: string,
|
||||||
search?: string,
|
search?: string,
|
||||||
|
|||||||
@@ -8,6 +8,15 @@ import {
|
|||||||
ValidationPipe,
|
ValidationPipe,
|
||||||
UsePipes,
|
UsePipes,
|
||||||
} from '@nestjs/common';
|
} from '@nestjs/common';
|
||||||
|
import {
|
||||||
|
IsString,
|
||||||
|
IsOptional,
|
||||||
|
IsNumber,
|
||||||
|
IsEnum,
|
||||||
|
Min,
|
||||||
|
Max,
|
||||||
|
IsNotEmpty,
|
||||||
|
} from 'class-validator';
|
||||||
import {
|
import {
|
||||||
ApiTags,
|
ApiTags,
|
||||||
ApiOperation,
|
ApiOperation,
|
||||||
@@ -27,6 +36,8 @@ export class VectorSearchDto {
|
|||||||
minLength: 1,
|
minLength: 1,
|
||||||
maxLength: 500,
|
maxLength: 500,
|
||||||
})
|
})
|
||||||
|
@IsString()
|
||||||
|
@IsNotEmpty()
|
||||||
query: string;
|
query: string;
|
||||||
|
|
||||||
@ApiProperty({
|
@ApiProperty({
|
||||||
@@ -37,6 +48,10 @@ export class VectorSearchDto {
|
|||||||
maximum: 100,
|
maximum: 100,
|
||||||
default: 10,
|
default: 10,
|
||||||
})
|
})
|
||||||
|
@IsOptional()
|
||||||
|
@IsNumber()
|
||||||
|
@Min(1)
|
||||||
|
@Max(100)
|
||||||
limit?: number;
|
limit?: number;
|
||||||
|
|
||||||
@ApiProperty({
|
@ApiProperty({
|
||||||
@@ -46,16 +61,22 @@ export class VectorSearchDto {
|
|||||||
enum: ['ICD9', 'ICD10'],
|
enum: ['ICD9', 'ICD10'],
|
||||||
default: undefined,
|
default: undefined,
|
||||||
})
|
})
|
||||||
|
@IsOptional()
|
||||||
|
@IsEnum(['ICD9', 'ICD10'])
|
||||||
category?: string;
|
category?: string;
|
||||||
|
|
||||||
@ApiProperty({
|
@ApiProperty({
|
||||||
description: 'Similarity threshold (0.0 - 1.0) for filtering results',
|
description: 'Similarity threshold (0.0 - 1.0) for filtering results',
|
||||||
example: 0.7,
|
example: 0.85,
|
||||||
required: false,
|
required: false,
|
||||||
minimum: 0.0,
|
minimum: 0.0,
|
||||||
maximum: 1.0,
|
maximum: 1.0,
|
||||||
default: 0.7,
|
default: 0.85,
|
||||||
})
|
})
|
||||||
|
@IsOptional()
|
||||||
|
@IsNumber()
|
||||||
|
@Min(0.0)
|
||||||
|
@Max(1.0)
|
||||||
threshold?: number;
|
threshold?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,6 +87,8 @@ export class EmbeddingRequestDto {
|
|||||||
minLength: 1,
|
minLength: 1,
|
||||||
maxLength: 1000,
|
maxLength: 1000,
|
||||||
})
|
})
|
||||||
|
@IsString()
|
||||||
|
@IsNotEmpty()
|
||||||
text: string;
|
text: string;
|
||||||
|
|
||||||
@ApiProperty({
|
@ApiProperty({
|
||||||
@@ -74,9 +97,24 @@ export class EmbeddingRequestDto {
|
|||||||
required: false,
|
required: false,
|
||||||
default: 'text-embedding-ada-002',
|
default: 'text-embedding-ada-002',
|
||||||
})
|
})
|
||||||
|
@IsOptional()
|
||||||
|
@IsString()
|
||||||
model?: string;
|
model?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Request body for updating the similarity threshold at runtime.
 *
 * `threshold` is mandatory and must be a number within the inclusive
 * range 0 to 1; the same bounds are advertised in the OpenAPI schema.
 */
export class ThresholdConfigDto {
  @ApiProperty({
    description: 'Similarity threshold value (0.0 - 1.0)',
    example: 0.85,
    minimum: 0,
    maximum: 1,
  })
  @IsNumber()
  @Min(0)
  @Max(1)
  threshold: number;
}
|
||||||
|
|
||||||
export class VectorSearchResponseDto {
|
export class VectorSearchResponseDto {
|
||||||
@ApiProperty({
|
@ApiProperty({
|
||||||
description: 'Array of search results with similarity scores',
|
description: 'Array of search results with similarity scores',
|
||||||
@@ -486,6 +524,61 @@ export class PgVectorController {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * POST advanced-search: forwards the search parameters to
 * `pgVectorService.advancedVectorSearch` and wraps the matches in the
 * standard search response envelope (data, total, query).
 *
 * A missing or falsy `limit` falls back to 10 before the service is called.
 */
@Post('advanced-search')
@ApiOperation({
  summary: 'Advanced vector similarity search',
  description:
    'Advanced vector search using multiple similarity metrics (cosine + euclidean) for more accurate results with higher threshold.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiBody({
  type: VectorSearchDto,
  description: 'Search parameters for advanced vector search',
  examples: {
    highPrecision: {
      summary: 'High precision search',
      value: {
        query: 'diabetes mellitus type 2',
        limit: 10,
        category: 'ICD10',
        threshold: 0.9,
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.OK,
  description:
    'Advanced vector search results with enhanced similarity scores',
  type: VectorSearchResponseDto,
})
@ApiResponse({
  status: HttpStatus.BAD_REQUEST,
  description: 'Invalid search parameters',
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error during advanced vector search',
})
async advancedVectorSearch(
  @Body() searchDto: VectorSearchDto,
): Promise<VectorSearchResponseDto> {
  const { query, limit, category, threshold } = searchDto;
  const effectiveLimit = limit || 10;

  const matches = await this.pgVectorService.advancedVectorSearch(
    query,
    effectiveLimit,
    category,
    threshold,
  );

  return {
    data: matches,
    total: matches.length,
    query,
  };
}
|
||||||
|
|
||||||
@Post('generate-embedding')
|
@Post('generate-embedding')
|
||||||
@ApiOperation({
|
@ApiOperation({
|
||||||
summary: 'Generate text embedding',
|
summary: 'Generate text embedding',
|
||||||
@@ -570,6 +663,50 @@ export class PgVectorController {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * POST regenerate-embeddings-enhanced: triggers
 * `pgVectorService.regenerateEmbeddingsWithEnhancedText()` with its default
 * arguments and returns the service's counters together with a
 * human-readable summary message.
 */
@Post('regenerate-embeddings-enhanced')
@ApiOperation({
  summary: 'Regenerate embeddings with enhanced text representation',
  description:
    'Regenerate existing embeddings using enhanced text representation for better similarity scores. This improves search quality.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Enhanced embedding regeneration results summary',
  schema: {
    type: 'object',
    properties: {
      processed: { type: 'number', example: 100 },
      errors: { type: 'number', example: 0 },
      totalSample: { type: 'number', example: 100 },
      message: {
        type: 'string',
        example: 'Enhanced embeddings regenerated successfully',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error during enhanced embedding regeneration',
})
async regenerateEmbeddingsEnhanced(): Promise<{
  processed: number;
  errors: number;
  totalSample: number;
  message: string;
}> {
  const summary =
    await this.pgVectorService.regenerateEmbeddingsWithEnhancedText();

  // The message reports both counters; it is emitted even when errors > 0.
  const message = `Enhanced embeddings regenerated successfully. Processed: ${summary.processed}, Errors: ${summary.errors}`;

  return { ...summary, message };
}
|
||||||
|
|
||||||
@Get('stats')
|
@Get('stats')
|
||||||
@ApiOperation({
|
@ApiOperation({
|
||||||
summary: 'Get embedding statistics',
|
summary: 'Get embedding statistics',
|
||||||
@@ -640,6 +777,234 @@ export class PgVectorController {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * GET threshold: reports the similarity threshold currently returned by
 * `pgVectorService.getSimilarityThreshold()` along with a fixed
 * explanatory description.
 */
@Get('threshold')
@ApiOperation({
  summary: 'Get current similarity threshold',
  description:
    'Get the current similarity threshold configuration used for vector search filtering.',
  tags: ['PgVector Operations'],
})
@ApiProduces('application/json')
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Current similarity threshold configuration',
  schema: {
    type: 'object',
    properties: {
      threshold: {
        type: 'number',
        description: 'Current similarity threshold value',
        example: 0.85,
      },
      description: {
        type: 'string',
        description: 'Description of the threshold setting',
        example: 'Minimum similarity score required for search results',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error retrieving threshold configuration',
})
async getSimilarityThreshold(): Promise<{
  threshold: number;
  description: string;
}> {
  return {
    threshold: this.pgVectorService.getSimilarityThreshold(),
    description:
      'Minimum similarity score required for search results (0.0 - 1.0)',
  };
}
|
||||||
|
|
||||||
|
/**
 * GET model: reports the embedding model name returned by
 * `pgVectorService.getEmbeddingModel()` and whether the
 * `OPENAI_API_MODEL` environment variable is currently set
 * ('Environment Variable') or not ('Default').
 */
@Get('model')
@ApiOperation({
  summary: 'Get current embedding model',
  description:
    'Get the current OpenAI embedding model configuration used for vector generation.',
  tags: ['PgVector Operations'],
})
@ApiProduces('application/json')
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Current embedding model configuration',
  schema: {
    type: 'object',
    properties: {
      model: {
        type: 'string',
        description: 'Current embedding model name',
        example: 'text-embedding-ada-002',
      },
      description: {
        type: 'string',
        description: 'Description of the model configuration',
        example: 'OpenAI embedding model for vector generation',
      },
      source: {
        type: 'string',
        description: 'Source of the model configuration',
        example: 'Environment Variable',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error retrieving model configuration',
})
async getEmbeddingModel(): Promise<{
  model: string;
  description: string;
  source: string;
}> {
  const activeModel = this.pgVectorService.getEmbeddingModel();

  // Any non-empty OPENAI_API_MODEL counts as an environment override.
  let source = 'Default';
  if (process.env.OPENAI_API_MODEL) {
    source = 'Environment Variable';
  }

  return {
    model: activeModel,
    description: 'OpenAI embedding model for vector generation',
    source,
  };
}
|
||||||
|
|
||||||
|
/**
 * POST threshold: replaces the similarity threshold via
 * `pgVectorService.setSimilarityThreshold` and echoes both the new value
 * and the value that was in effect immediately before the update.
 */
@Post('threshold')
@ApiOperation({
  summary: 'Set similarity threshold',
  description:
    'Set the similarity threshold for vector search filtering. Higher values result in more strict matching.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiBody({
  type: ThresholdConfigDto,
  description: 'Threshold configuration parameters',
})
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Similarity threshold updated successfully',
  schema: {
    type: 'object',
    properties: {
      message: {
        type: 'string',
        description: 'Success message',
        example: 'Similarity threshold updated successfully',
      },
      threshold: {
        type: 'number',
        description: 'Updated threshold value',
        example: 0.9,
      },
      previousThreshold: {
        type: 'number',
        description: 'Previous threshold value',
        example: 0.85,
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.BAD_REQUEST,
  description: 'Invalid threshold value (must be between 0.0 and 1.0)',
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error updating threshold configuration',
})
async setSimilarityThreshold(
  @Body() thresholdConfig: ThresholdConfigDto,
): Promise<{
  message: string;
  threshold: number;
  previousThreshold: number;
}> {
  // Capture the old value first so the response can report the transition.
  const previousThreshold = this.pgVectorService.getSimilarityThreshold();

  const { threshold } = thresholdConfig;
  this.pgVectorService.setSimilarityThreshold(threshold);

  return {
    message: 'Similarity threshold updated successfully',
    threshold,
    previousThreshold,
  };
}
|
||||||
|
|
||||||
|
/**
 * POST model: switches the embedding model through
 * `pgVectorService.setEmbeddingModel` and echoes the requested model name
 * together with the model that was active before the call.
 *
 * NOTE(review): the body is typed inline rather than with a DTO class, so
 * no validation decorators apply to it in this block — confirm whether a
 * bad `model` value reaches the service unvalidated.
 */
@Post('model')
@ApiOperation({
  summary: 'Set embedding model',
  description:
    'Set the OpenAI embedding model for vector generation. This will reinitialize the embeddings service.',
  tags: ['PgVector Operations'],
})
@ApiConsumes('application/json')
@ApiProduces('application/json')
@ApiBody({
  schema: {
    type: 'object',
    properties: {
      model: {
        type: 'string',
        description: 'OpenAI embedding model name',
        example: 'text-embedding-ada-002',
      },
    },
    required: ['model'],
  },
  description: 'Model configuration parameters',
})
@ApiResponse({
  status: HttpStatus.OK,
  description: 'Embedding model updated successfully',
  schema: {
    type: 'object',
    properties: {
      message: {
        type: 'string',
        description: 'Success message',
        example: 'Embedding model updated successfully',
      },
      model: {
        type: 'string',
        description: 'Updated model name',
        example: 'text-embedding-ada-002',
      },
      previousModel: {
        type: 'string',
        description: 'Previous model name',
        example: 'text-embedding-ada-002',
      },
    },
  },
})
@ApiResponse({
  status: HttpStatus.BAD_REQUEST,
  description: 'Invalid model name',
})
@ApiResponse({
  status: HttpStatus.INTERNAL_SERVER_ERROR,
  description: 'Error updating model configuration',
})
async setEmbeddingModel(@Body() body: { model: string }): Promise<{
  message: string;
  model: string;
  previousModel: string;
}> {
  // Capture the old model first so the response can report the transition.
  const previousModel = this.pgVectorService.getEmbeddingModel();

  const { model } = body;
  await this.pgVectorService.setEmbeddingModel(model);

  return {
    message: 'Embedding model updated successfully',
    model,
    previousModel,
  };
}
|
||||||
|
|
||||||
@Post('refresh')
|
@Post('refresh')
|
||||||
@ApiOperation({
|
@ApiOperation({
|
||||||
summary: 'Refresh pgvector store',
|
summary: 'Refresh pgvector store',
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
import { Injectable, Logger } from '@nestjs/common';
|
import { Injectable, Logger } from '@nestjs/common';
|
||||||
import { PrismaClient } from '../../generated/prisma';
|
import { PrismaClient } from '@prisma/client';
|
||||||
import { OpenAIEmbeddings } from '@langchain/openai';
|
import { OpenAIEmbeddings } from '@langchain/openai';
|
||||||
import { PGVectorStore } from '@langchain/community/vectorstores/pgvector';
|
import { PGVectorStore } from '@langchain/community/vectorstores/pgvector';
|
||||||
import { Document } from 'langchain/document';
|
|
||||||
import { Pool } from 'pg';
|
import { Pool } from 'pg';
|
||||||
|
|
||||||
export interface VectorSearchResult {
|
export interface VectorSearchResult {
|
||||||
@@ -72,6 +71,41 @@ export class PgVectorService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Replace the active OpenAI embeddings client with one bound to `modelName`.
 *
 * Reads the API key from `OPENAI_API_KEY`, builds a new `OpenAIEmbeddings`
 * instance (max concurrency 5), and then records the model name in
 * `process.env.OPENAI_API_MODEL`.
 *
 * @param modelName OpenAI embedding model to switch to.
 * @throws Error wrapping any failure, including a missing API key; the
 *   original error is logged before the wrapped one is thrown.
 */
private async reinitializeEmbeddings(modelName: string): Promise<void> {
  try {
    const openAiKey = process.env.OPENAI_API_KEY;
    if (!openAiKey) {
      throw new Error('OPENAI_API_KEY not found');
    }

    this.logger.log(
      `Reinitializing OpenAI embeddings with model: ${modelName}`,
    );

    const clientOptions = {
      openAIApiKey: openAiKey,
      modelName,
      maxConcurrency: 5,
    };
    this.embeddings = new OpenAIEmbeddings(clientOptions);

    // Keep the environment variable in step with the client that is now active.
    process.env.OPENAI_API_MODEL = modelName;

    this.logger.log(
      `OpenAI embeddings reinitialized successfully with model: ${modelName}`,
    );
  } catch (error) {
    this.logger.error('Failed to reinitialize OpenAI embeddings:', error);
    throw new Error(
      `Failed to reinitialize OpenAI embeddings: ${error.message}`,
    );
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize pgvector store dengan LangChain
|
* Initialize pgvector store dengan LangChain
|
||||||
*/
|
*/
|
||||||
@@ -115,25 +149,35 @@ export class PgVectorService {
|
|||||||
/**
|
/**
|
||||||
* Generate embedding untuk text menggunakan OpenAI
|
* Generate embedding untuk text menggunakan OpenAI
|
||||||
*/
|
*/
|
||||||
async generateEmbedding(
|
async generateEmbedding(text: string, model?: string): Promise<number[]> {
|
||||||
text: string,
|
|
||||||
model: string = 'text-embedding-ada-002',
|
|
||||||
): Promise<number[]> {
|
|
||||||
try {
|
try {
|
||||||
|
// Get model from parameter, environment variable, or use default
|
||||||
|
const embeddingModel =
|
||||||
|
model || process.env.OPENAI_API_MODEL || 'text-embedding-ada-002';
|
||||||
|
|
||||||
this.logger.log(
|
this.logger.log(
|
||||||
`Generating embedding for text: ${text.substring(0, 100)}...`,
|
`Generating embedding for text: ${text.substring(0, 100)}... using model: ${embeddingModel}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Check if we need to reinitialize embeddings with new model
|
||||||
|
const currentModel = this.getEmbeddingModel();
|
||||||
|
if (model && model !== currentModel) {
|
||||||
|
this.logger.log(
|
||||||
|
`Switching embedding model from ${currentModel} to ${model}`,
|
||||||
|
);
|
||||||
|
await this.reinitializeEmbeddings(model);
|
||||||
|
}
|
||||||
|
|
||||||
if (!this.embeddings) {
|
if (!this.embeddings) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
'OpenAI embeddings not initialized. Please check your API configuration.',
|
'OpenAI embeddings not initialized. Please check your API configuration.',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use OpenAI embeddings
|
// Use OpenAI embeddings with current model
|
||||||
const embedding = await this.embeddings.embedQuery(text);
|
const embedding = await this.embeddings.embedQuery(text);
|
||||||
this.logger.log(
|
this.logger.log(
|
||||||
`Generated OpenAI embedding with ${embedding.length} dimensions`,
|
`Generated OpenAI embedding with ${embedding.length} dimensions using model: ${this.getEmbeddingModel()}`,
|
||||||
);
|
);
|
||||||
return embedding;
|
return embedding;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -191,7 +235,8 @@ export class PgVectorService {
|
|||||||
`UPDATE icd_codes
|
`UPDATE icd_codes
|
||||||
SET embedding = $1::vector,
|
SET embedding = $1::vector,
|
||||||
metadata = $2::jsonb,
|
metadata = $2::jsonb,
|
||||||
content = $3
|
content = $3,
|
||||||
|
"updatedAt" = NOW()
|
||||||
WHERE id = $4`,
|
WHERE id = $4`,
|
||||||
[
|
[
|
||||||
vectorString,
|
vectorString,
|
||||||
@@ -289,7 +334,8 @@ export class PgVectorService {
|
|||||||
`UPDATE icd_codes
|
`UPDATE icd_codes
|
||||||
SET embedding = $1::vector,
|
SET embedding = $1::vector,
|
||||||
metadata = $2::jsonb,
|
metadata = $2::jsonb,
|
||||||
content = $3
|
content = $3,
|
||||||
|
"updatedAt" = NOW()
|
||||||
WHERE id = $4`,
|
WHERE id = $4`,
|
||||||
[
|
[
|
||||||
vectorString,
|
vectorString,
|
||||||
@@ -337,16 +383,23 @@ export class PgVectorService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Vector similarity search menggunakan pgvector
|
* Vector similarity search menggunakan pgvector dengan threshold yang dapat dikonfigurasi
|
||||||
*/
|
*/
|
||||||
async vectorSearch(
|
async vectorSearch(
|
||||||
query: string,
|
query: string,
|
||||||
limit: number = 10,
|
limit: number = 10,
|
||||||
category?: string,
|
category?: string,
|
||||||
threshold: number = 0.7,
|
threshold?: number,
|
||||||
): Promise<VectorSearchResult[]> {
|
): Promise<VectorSearchResult[]> {
|
||||||
|
// Get threshold from environment variable or use default
|
||||||
|
const defaultThreshold = parseFloat(
|
||||||
|
process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85',
|
||||||
|
);
|
||||||
|
const similarityThreshold = threshold || defaultThreshold;
|
||||||
try {
|
try {
|
||||||
this.logger.log(`Performing pgvector search for: ${query}`);
|
this.logger.log(
|
||||||
|
`Performing pgvector search for: ${query} with threshold: ${similarityThreshold}`,
|
||||||
|
);
|
||||||
|
|
||||||
if (!this.embeddings) {
|
if (!this.embeddings) {
|
||||||
throw new Error('OpenAI embeddings not initialized');
|
throw new Error('OpenAI embeddings not initialized');
|
||||||
@@ -358,17 +411,19 @@ export class PgVectorService {
|
|||||||
// Convert embedding array to proper vector format for pgvector
|
// Convert embedding array to proper vector format for pgvector
|
||||||
const vectorString = `[${queryEmbedding.join(',')}]`;
|
const vectorString = `[${queryEmbedding.join(',')}]`;
|
||||||
|
|
||||||
// Build SQL query for vector similarity search
|
// Build SQL query for vector similarity search with higher precision
|
||||||
|
// Using cosine distance and converting to similarity score
|
||||||
let sql = `
|
let sql = `
|
||||||
SELECT
|
SELECT
|
||||||
id, code, display, version, category,
|
id, code, display, version, category,
|
||||||
1 - (embedding <=> $1::vector) as similarity
|
(1 - (embedding <=> $1::vector)) as similarity
|
||||||
FROM icd_codes
|
FROM icd_codes
|
||||||
WHERE embedding IS NOT NULL
|
WHERE embedding IS NOT NULL
|
||||||
|
AND (1 - (embedding <=> $1::vector)) >= $2
|
||||||
`;
|
`;
|
||||||
|
|
||||||
const params: any[] = [vectorString];
|
const params: any[] = [vectorString, similarityThreshold];
|
||||||
let paramIndex = 2;
|
let paramIndex = 3;
|
||||||
|
|
||||||
if (category) {
|
if (category) {
|
||||||
sql += ` AND category = $${paramIndex}`;
|
sql += ` AND category = $${paramIndex}`;
|
||||||
@@ -376,23 +431,24 @@ export class PgVectorService {
|
|||||||
paramIndex++;
|
paramIndex++;
|
||||||
}
|
}
|
||||||
|
|
||||||
sql += ` ORDER BY embedding <=> $1::vector ASC LIMIT $${paramIndex}`;
|
// Order by similarity descending and limit results
|
||||||
|
sql += ` ORDER BY similarity DESC LIMIT $${paramIndex}`;
|
||||||
params.push(limit);
|
params.push(limit);
|
||||||
|
|
||||||
// Execute raw SQL query
|
// Execute raw SQL query
|
||||||
const result = await this.pool.query(sql, params);
|
const result = await this.pool.query(sql, params);
|
||||||
|
|
||||||
// Transform and filter results
|
// Transform results (no need to filter again since SQL already filters)
|
||||||
const filteredResults: VectorSearchResult[] = result.rows
|
const filteredResults: VectorSearchResult[] = result.rows.map(
|
||||||
.filter((row: any) => row.similarity >= threshold)
|
(row: any) => ({
|
||||||
.map((row: any) => ({
|
|
||||||
id: row.id,
|
id: row.id,
|
||||||
code: row.code,
|
code: row.code,
|
||||||
display: row.display,
|
display: row.display,
|
||||||
version: row.version,
|
version: row.version,
|
||||||
category: row.category,
|
category: row.category,
|
||||||
similarity: parseFloat(row.similarity),
|
similarity: parseFloat(row.similarity),
|
||||||
}));
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
this.logger.log(
|
this.logger.log(
|
||||||
`Pgvector search returned ${filteredResults.length} results for query: "${query}"`,
|
`Pgvector search returned ${filteredResults.length} results for query: "${query}"`,
|
||||||
@@ -417,12 +473,12 @@ export class PgVectorService {
|
|||||||
try {
|
try {
|
||||||
this.logger.log(`Performing hybrid search for: ${query}`);
|
this.logger.log(`Performing hybrid search for: ${query}`);
|
||||||
|
|
||||||
// Get vector search results
|
// Get vector search results with higher threshold
|
||||||
const vectorResults = await this.vectorSearch(
|
const vectorResults = await this.vectorSearch(
|
||||||
query,
|
query,
|
||||||
limit * 2,
|
limit * 2,
|
||||||
category,
|
category,
|
||||||
0.5,
|
parseFloat(process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85'),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Get text search results
|
// Get text search results
|
||||||
@@ -475,7 +531,7 @@ export class PgVectorService {
|
|||||||
try {
|
try {
|
||||||
let sql = 'SELECT id, code, display, version, category FROM icd_codes';
|
let sql = 'SELECT id, code, display, version, category FROM icd_codes';
|
||||||
const params: any[] = [];
|
const params: any[] = [];
|
||||||
let whereConditions: string[] = [];
|
const whereConditions: string[] = [];
|
||||||
let paramIndex = 1;
|
let paramIndex = 1;
|
||||||
|
|
||||||
if (category) {
|
if (category) {
|
||||||
@@ -515,6 +571,51 @@ export class PgVectorService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get the current similarity threshold configuration.
 *
 * Reads `VECTOR_SIMILARITY_THRESHOLD` from the environment and parses it as
 * a float. When the variable is unset, empty, or does not parse to a finite
 * number, the default of 0.85 is returned instead of NaN/Infinity, so a
 * misconfigured environment cannot propagate an unusable threshold.
 *
 * @returns The configured threshold, or 0.85 as a fallback.
 */
getSimilarityThreshold(): number {
  const fallbackThreshold = 0.85;
  const configured = parseFloat(
    process.env.VECTOR_SIMILARITY_THRESHOLD || String(fallbackThreshold),
  );

  return Number.isFinite(configured) ? configured : fallbackThreshold;
}
|
||||||
|
|
||||||
|
/**
 * Report the embedding model currently configured.
 *
 * @returns The value of `OPENAI_API_MODEL` when it is set and non-empty,
 *   otherwise 'text-embedding-ada-002'.
 */
getEmbeddingModel(): string {
  const configuredModel = process.env.OPENAI_API_MODEL;
  if (configuredModel) {
    return configuredModel;
  }
  return 'text-embedding-ada-002';
}
|
||||||
|
|
||||||
|
/**
 * Set the similarity threshold (runtime configuration).
 *
 * The value is stored as a string in
 * `process.env.VECTOR_SIMILARITY_THRESHOLD` and the change is logged.
 *
 * @param threshold New threshold; must be a number in the inclusive range 0..1.
 * @throws Error when `threshold` is NaN or lies outside 0..1.
 */
setSimilarityThreshold(threshold: number): void {
  // NaN fails both range comparisons, so it has to be rejected explicitly;
  // otherwise the literal string "NaN" would be stored as the threshold.
  if (Number.isNaN(threshold) || threshold < 0 || threshold > 1) {
    throw new Error('Similarity threshold must be between 0 and 1');
  }

  process.env.VECTOR_SIMILARITY_THRESHOLD = threshold.toString();
  this.logger.log(`Similarity threshold updated to: ${threshold}`);
}
|
||||||
|
|
||||||
|
/**
 * Switch the embedding model at runtime.
 *
 * Does nothing beyond logging when the requested model is already the one
 * reported by `getEmbeddingModel()`; otherwise rebuilds the embeddings
 * client through `reinitializeEmbeddings`.
 *
 * @param modelName Name of the model to activate; must be a non-empty string.
 * @throws Error when `modelName` is empty or not a string.
 */
async setEmbeddingModel(modelName: string): Promise<void> {
  const isNonEmptyString = typeof modelName === 'string' && modelName !== '';
  if (!isNonEmptyString) {
    throw new Error('Model name must be a valid string');
  }

  const activeModel = this.getEmbeddingModel();
  if (activeModel === modelName) {
    this.logger.log(`Model ${modelName} is already active`);
    return;
  }

  this.logger.log(
    `Switching embedding model from ${activeModel} to ${modelName}`,
  );
  await this.reinitializeEmbeddings(modelName);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get embedding statistics
|
* Get embedding statistics
|
||||||
*/
|
*/
|
||||||
@@ -524,6 +625,7 @@ export class PgVectorService {
|
|||||||
withoutEmbeddings: number;
|
withoutEmbeddings: number;
|
||||||
percentage: number;
|
percentage: number;
|
||||||
vectorStoreStatus: string;
|
vectorStoreStatus: string;
|
||||||
|
currentThreshold: number;
|
||||||
}> {
|
}> {
|
||||||
try {
|
try {
|
||||||
// Use raw SQL to get embedding statistics
|
// Use raw SQL to get embedding statistics
|
||||||
@@ -548,6 +650,7 @@ export class PgVectorService {
|
|||||||
withoutEmbeddings,
|
withoutEmbeddings,
|
||||||
percentage: Math.round(percentage * 100) / 100,
|
percentage: Math.round(percentage * 100) / 100,
|
||||||
vectorStoreStatus,
|
vectorStoreStatus,
|
||||||
|
currentThreshold: this.getSimilarityThreshold(),
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error('Error getting embedding stats:', error);
|
this.logger.error('Error getting embedding stats:', error);
|
||||||
@@ -569,6 +672,95 @@ export class PgVectorService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Advanced vector search that scores each row with two similarity metrics.
 *
 * Embeds `query`, selects rows from `icd_codes` whose cosine similarity to
 * the query is at least the threshold (optionally restricted to one
 * category), orders them by cosine then euclidean score, and returns a
 * weighted score (70% cosine, 30% euclidean) rounded to three decimals.
 *
 * @param query Free-text search query.
 * @param limit Maximum number of rows to return (default 10).
 * @param category Optional category filter applied with equality.
 * @param threshold Minimum cosine similarity. When omitted, the value of
 *   `VECTOR_SIMILARITY_THRESHOLD` (default 0.85) is used. An explicit 0 is
 *   honoured.
 * @returns Matching codes with their combined similarity score.
 * @throws Error when the embeddings client is not initialized; any other
 *   error is logged and rethrown unchanged.
 */
async advancedVectorSearch(
  query: string,
  limit: number = 10,
  category?: string,
  threshold?: number,
): Promise<VectorSearchResult[]> {
  try {
    // Get threshold from environment variable or use default
    const defaultThreshold = parseFloat(
      process.env.VECTOR_SIMILARITY_THRESHOLD || '0.85',
    );
    // `??` instead of `||`: a caller-supplied threshold of 0 is a valid
    // request ("no filtering") and must not be replaced by the default.
    const similarityThreshold = threshold ?? defaultThreshold;

    this.logger.log(
      `Performing advanced vector search for: ${query} with threshold: ${similarityThreshold}`,
    );

    if (!this.embeddings) {
      throw new Error('OpenAI embeddings not initialized');
    }

    // Generate embedding for query
    const queryEmbedding = await this.generateEmbedding(query);
    const vectorString = `[${queryEmbedding.join(',')}]`;

    // Only the two metrics that are actually read below are selected.
    // NOTE(review): `1 - L2 distance` is not bounded to 0..1, so the combined
    // score reported to callers can fall below the requested threshold even
    // though the cosine filter passed — confirm this is intended.
    let sql = `
      SELECT
        id, code, display, version, category,
        (1 - (embedding <=> $1::vector)) as cosine_similarity,
        (1 - (embedding <-> $1::vector)) as euclidean_similarity
      FROM icd_codes
      WHERE embedding IS NOT NULL
    `;

    const params: any[] = [vectorString];
    let paramIndex = 2;

    if (category) {
      sql += ` AND category = $${paramIndex}`;
      params.push(category);
      paramIndex++;
    }

    // Filter by cosine similarity threshold
    sql += ` AND (1 - (embedding <=> $1::vector)) >= $${paramIndex}`;
    params.push(similarityThreshold);
    paramIndex++;

    // Order by both similarity scores and limit
    sql += ` ORDER BY cosine_similarity DESC, euclidean_similarity DESC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(sql, params);

    // Transform results with enhanced similarity scoring
    const filteredResults: VectorSearchResult[] = result.rows.map(
      (row: any) => {
        const cosineSim = parseFloat(row.cosine_similarity);
        const euclideanSim = parseFloat(row.euclidean_similarity);

        // Calculate combined similarity score (weighted average)
        const combinedSimilarity = cosineSim * 0.7 + euclideanSim * 0.3;

        return {
          id: row.id,
          code: row.code,
          display: row.display,
          version: row.version,
          category: row.category,
          similarity: Math.round(combinedSimilarity * 1000) / 1000, // Round to 3 decimal places
        };
      },
    );

    this.logger.log(
      `Advanced vector search returned ${filteredResults.length} results for query: "${query}" with threshold: ${similarityThreshold}`,
    );
    return filteredResults;
  } catch (error) {
    this.logger.error('Error in advanced vector search:', error);
    throw error;
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get vector store status
|
* Get vector store status
|
||||||
*/
|
*/
|
||||||
@@ -589,9 +781,10 @@ export class PgVectorService {
|
|||||||
initialized: !!this.vectorStore,
|
initialized: !!this.vectorStore,
|
||||||
documentCount,
|
documentCount,
|
||||||
embeddingModel: this.embeddings
|
embeddingModel: this.embeddings
|
||||||
? `OpenAI ${process.env.OPENAI_API_MODEL || 'text-embedding-ada-002'}`
|
? `OpenAI ${this.getEmbeddingModel()}`
|
||||||
: 'Not Available',
|
: 'Not Available',
|
||||||
lastUpdated: new Date(),
|
lastUpdated: new Date(),
|
||||||
|
currentThreshold: this.getSimilarityThreshold(),
|
||||||
};
|
};
|
||||||
|
|
||||||
return status;
|
return status;
|
||||||
@@ -601,6 +794,164 @@ export class PgVectorService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the text that is embedded for an ICD code.
 *
 * Starts from "<code> <display>" and appends, in this order: a category
 * phrase for ICD9/ICD10, the version when present, one context phrase per
 * keyword group that matches the lower-cased display text, and finally a
 * fixed generic medical suffix.
 *
 * @param code Row-like object; `code`, `display`, `category` and `version`
 *   are the only fields read.
 * @returns The enriched text representation.
 */
private createEnhancedTextRepresentation(code: any): string {
  const segments: string[] = [`${code.code} ${code.display}`];

  // Category context
  switch (code.category) {
    case 'ICD9':
      segments.push(` ICD-9 CM procedure diagnosis`);
      break;
    case 'ICD10':
      segments.push(` ICD-10 diagnosis condition`);
      break;
    default:
      break;
  }

  // Version context
  if (code.version) {
    segments.push(` ${code.version}`);
  }

  // Keyword groups are checked in a fixed order; each matching group
  // contributes its phrase exactly once.
  const loweredDisplay = code.display.toLowerCase();
  const contextRules: Array<[string[], string]> = [
    [
      ['procedure', 'surgery', 'operation'],
      ' medical procedure surgical intervention',
    ],
    [
      ['diagnosis', 'condition', 'disease'],
      ' medical diagnosis clinical condition',
    ],
    [['cranial', 'brain', 'head'], ' neurological cranial brain head'],
    [
      ['cardiac', 'heart', 'cardiovascular'],
      ' cardiac heart cardiovascular',
    ],
    [['pulmonary', 'lung', 'respiratory'], ' pulmonary lung respiratory'],
  ];

  for (const [keywords, phrase] of contextRules) {
    if (keywords.some((keyword) => loweredDisplay.includes(keyword))) {
      segments.push(phrase);
    }
  }

  // Generic medical terms are always appended last.
  segments.push(' medical healthcare clinical');

  return segments.join('');
}
|
||||||
|
|
||||||
|
/**
 * Rebuild the stored embedding of ICD codes that already have one, using the
 * enhanced text representation as the embedding input.
 *
 * Rows are handled one at a time. A failure on a single row is logged and
 * counted, and the remaining rows are still processed. A failure outside the
 * per-row handling (for example the initial SELECT) is logged and rethrown.
 *
 * @param limit Maximum number of rows to fetch for regeneration (default 100).
 * @returns Counts of rows updated (`processed`), rows that failed (`errors`)
 *          and rows fetched (`totalSample`).
 */
async regenerateEmbeddingsWithEnhancedText(limit: number = 100): Promise<{
  processed: number;
  errors: number;
  totalSample: number;
}> {
  try {
    this.logger.log(
      `Starting enhanced embedding regeneration for ${limit} ICD codes...`,
    );

    // Fetch only rows that already carry an embedding.
    const { rows } = await this.pool.query(
      'SELECT id, code, display, version, category FROM icd_codes WHERE embedding IS NOT NULL LIMIT $1',
      [limit],
    );
    const totalSample = rows.length;

    if (totalSample === 0) {
      this.logger.log('No ICD codes found with embeddings to regenerate');
      return { processed: 0, errors: 0, totalSample: 0 };
    }

    this.logger.log(
      `Found ${totalSample} codes to regenerate with enhanced text`,
    );

    let processed = 0;
    let errors = 0;

    for (const row of rows) {
      try {
        // Build the enhanced text and embed it.
        const enhancedText = this.createEnhancedTextRepresentation(row);
        const vector = await this.generateEmbedding(enhancedText);

        // Serialize as "[v1,v2,...]" so the value can be cast with ::vector.
        const vectorLiteral = `[${vector.join(',')}]`;

        // Store the new embedding together with the text it was built from.
        await this.pool.query(
          `UPDATE icd_codes
SET embedding = $1::vector,
content = $2,
"updatedAt" = NOW()
WHERE id = $3`,
          [vectorLiteral, enhancedText, row.id],
        );

        processed += 1;

        // Progress is reported after every tenth successful update.
        if (processed % 10 === 0) {
          this.logger.log(
            `Regenerated ${processed}/${totalSample} enhanced embeddings`,
          );
        }
      } catch (rowError) {
        // One bad row must not abort the batch: log it, count it, move on.
        this.logger.error(
          `Error regenerating embedding for code ${row.code}:`,
          rowError,
        );
        errors += 1;
      }
    }

    this.logger.log(
      `Enhanced embedding regeneration completed. Processed: ${processed}, Errors: ${errors}, Total: ${totalSample}`,
    );

    return { processed, errors, totalSample };
  } catch (fatalError) {
    this.logger.error(
      'Error in regenerateEmbeddingsWithEnhancedText:',
      fatalError,
    );
    throw fatalError;
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleanup resources
|
* Cleanup resources
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user