300 lines
8.1 KiB
JavaScript
300 lines
8.1 KiB
JavaScript
|
|
const multer = require('multer');
|
||
|
|
const path = require('path');
|
||
|
|
const fs = require('fs');
|
||
|
|
const pdf = require('pdf-parse');
|
||
|
|
const { Document, sequelize } = require('../models');
|
||
|
|
const { Op } = require('sequelize');
|
||
|
|
const logger = require('../utils/logger');
|
||
|
|
const embeddingService = require('../services/embeddingService');
|
||
|
|
const graphRagService = require('../services/graphRagService');
|
||
|
|
|
||
|
|
// Configure multer for file uploads
|
||
|
|
/**
 * Disk storage engine handed to multer.
 *
 * - `destination`: resolves `../../uploads` relative to this file and makes
 *   sure the directory exists before multer writes into it.
 * - `filename`: builds `<fieldname>-<timestamp>-<random><original extension>`
 *   so that uploads with the same original name do not overwrite each other.
 */
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    const uploadPath = path.join(__dirname, '../../uploads');

    try {
      // With `recursive: true` this is a no-op when the directory already
      // exists, so no separate existence check is needed.
      fs.mkdirSync(uploadPath, { recursive: true });
    } catch (error) {
      // Report the failure through the callback instead of throwing from
      // inside multer's storage hook.
      cb(error);
      return;
    }

    cb(null, uploadPath);
  },
  filename: (req, file, cb) => {
    const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1e9)}`;
    const extension = path.extname(file.originalname);
    cb(null, `${file.fieldname}-${uniqueSuffix}${extension}`);
  }
});
|
||
|
|
|
||
|
|
/**
 * Configured multer middleware for single-document uploads.
 *
 * Files are written through `storage`, size/count limits are enforced by
 * multer, and `fileFilter` rejects any file whose original name does not end
 * in one of the accepted extensions (case-insensitive).
 */
const upload = multer({
  storage,
  limits: {
    fileSize: 10 * 1024 * 1024, // 10MB for testing
    fieldSize: 10 * 1024 * 1024, // 10MB for field values
    fieldNameSize: 100, // 100 bytes for field names
    files: 1 // Only 1 file at a time
  },
  fileFilter: (req, file, cb) => {
    const extension = path.extname(file.originalname).toLowerCase();
    const isAccepted = ['.pdf', '.txt', '.doc', '.docx'].includes(extension);

    // Guard clause: reject anything outside the accepted extension list.
    if (!isAccepted) {
      cb(new Error('Invalid file type. Only PDF, TXT, DOC, DOCX files are allowed.'));
      return;
    }

    cb(null, true);
  }
});
|
||
|
|
|
||
|
|
/**
 * Extracts plain text from an uploaded file (`req.file`).
 *
 * `application/pdf` files are parsed with `pdf-parse`; `text/plain` files are
 * read as UTF-8. Every other MIME type yields an empty string.
 *
 * A PDF that cannot be read or parsed is logged and yields an empty string, so
 * that no placeholder error message is stored, embedded and indexed as if it
 * were the document's content.
 *
 * @param {{ path: string, mimetype: string }} file - Uploaded file descriptor.
 * @returns {Promise<string>} The extracted text, or '' when none is available.
 */
const extractUploadedText = async (file) => {
  if (file.mimetype === 'application/pdf') {
    try {
      const dataBuffer = await fs.promises.readFile(file.path);
      const pdfData = await pdf(dataBuffer);
      return pdfData.text || '';
    } catch (error) {
      logger.error('PDF extraction error:', error);
      return '';
    }
  }

  if (file.mimetype === 'text/plain') {
    return fs.promises.readFile(file.path, 'utf8');
  }

  return '';
};

/**
 * Normalises the `tags` form field into a list of trimmed, non-empty tags.
 *
 * Accepts a comma-separated string or an array of such strings (which is what
 * a repeated form field produces).
 *
 * @param {string|string[]|undefined|null} tags - Raw `tags` value from the body.
 * @returns {string[]} Cleaned tag list (possibly empty).
 */
const parseTags = (tags) => {
  if (tags === undefined || tags === null || tags === '') {
    return [];
  }

  return [].concat(tags)
    .flatMap((entry) => String(entry).split(','))
    .map((tag) => tag.trim())
    .filter((tag) => tag.length > 0);
};

/**
 * POST handler: stores an uploaded file as a `Document` row and indexes it.
 *
 * Flow:
 *  1. Responds 400 when no file was uploaded.
 *  2. Extracts text (PDF / plain text only).
 *  3. Creates the `Document` record with `indexing_status: 'processing'`.
 *  4. When text is available, generates an embedding from the first 15000
 *     characters and marks the record `completed`; otherwise, or when
 *     embedding fails, marks it `failed`.
 *  5. Responds 201 with the document.
 *
 * On an unexpected error the handler responds 500. The uploaded file is removed
 * only when no `Document` record was created for it, so a persisted record
 * never ends up pointing at a deleted file.
 *
 * @param {object} req - Express request; reads `req.file` and `req.body`.
 * @param {object} res - Express response.
 * @returns {Promise<*>} Resolves once a response has been sent.
 */
const uploadDocument = async (req, res) => {
  let document = null;

  try {
    if (!req.file) {
      return res.status(400).json({
        success: false,
        error: 'No file uploaded'
      });
    }

    const { category, tags } = req.body || {};
    const extractedText = await extractUploadedText(req.file);

    // Create document record
    document = await Document.create({
      filename: req.file.filename,
      original_filename: req.file.originalname,
      file_path: req.file.path,
      file_type: req.file.mimetype,
      file_size: req.file.size,
      content: extractedText,
      extracted_text: extractedText,
      category: category || 'general',
      tags: parseTags(tags),
      indexing_status: 'processing'
    });

    // Generate and store embeddings (if text available)
    if (extractedText.trim().length > 0) {
      try {
        const embedding = await embeddingService.embedText(extractedText.slice(0, 15000));
        await document.update({ embeddings: embedding, is_indexed: true, indexing_status: 'completed' });
      } catch (e) {
        logger.error('Embedding generation failed:', e);
        await document.update({ is_indexed: false, indexing_status: 'failed' });
      }
    } else {
      await document.update({ is_indexed: false, indexing_status: 'failed' });
    }

    logger.info(`Document uploaded: ${document.id}`);

    res.status(201).json({
      success: true,
      data: { document }
    });
  } catch (error) {
    logger.error('Upload document error:', error);

    // Clean up the uploaded file only if document creation failed; once a
    // record exists it references this path and the file must be kept.
    if (!document && req.file) {
      try {
        await fs.promises.unlink(req.file.path);
      } catch (cleanupError) {
        // A missing file is fine; anything else is logged but must not
        // prevent the error response from being sent.
        if (cleanupError.code !== 'ENOENT') {
          logger.error('Upload cleanup error:', cleanupError);
        }
      }
    }

    res.status(500).json({
      success: false,
      error: 'Internal server error'
    });
  }
};
|
||
|
|
|
||
|
|
/**
 * GET handler: returns a paginated, optionally filtered list of documents.
 *
 * Query parameters:
 *  - `page`      1-based page number (default 1).
 *  - `limit`     page size (default 10).
 *  - `category`  exact category filter.
 *  - `isIndexed` `'true'` filters on indexed documents, any other supplied
 *                value on non-indexed ones.
 *  - `search`    case-insensitive substring match against the original
 *                filename or the extracted text.
 *
 * `page` and `limit` values that are not positive integers fall back to their
 * defaults instead of producing a NaN or negative LIMIT/OFFSET.
 *
 * Results are ordered by `created_at` descending. Responds 500 on any error.
 *
 * @param {object} req - Express request; reads `req.query`.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
const getDocuments = async (req, res) => {
  try {
    const { page = 1, limit = 10, category, search, isIndexed } = req.query;

    // Parse a pagination value, falling back when it is not a positive integer.
    const toPositiveInt = (raw, fallback) => {
      const parsed = Number.parseInt(raw, 10);
      return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
    };
    const pageNumber = toPositiveInt(page, 1);
    const pageSize = toPositiveInt(limit, 10);

    const whereClause = {};
    if (category) whereClause.category = category;
    if (isIndexed !== undefined) whereClause.is_indexed = isIndexed === 'true';
    if (search) {
      whereClause[Op.or] = [
        { original_filename: { [Op.iLike]: `%${search}%` } },
        { extracted_text: { [Op.iLike]: `%${search}%` } }
      ];
    }

    const documents = await Document.findAndCountAll({
      where: whereClause,
      order: [['created_at', 'DESC']],
      limit: pageSize,
      offset: (pageNumber - 1) * pageSize
    });

    res.json({
      success: true,
      data: {
        documents: documents.rows,
        pagination: {
          page: pageNumber,
          limit: pageSize,
          total: documents.count,
          pages: Math.ceil(documents.count / pageSize)
        }
      }
    });
  } catch (error) {
    logger.error('Get documents error:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error'
    });
  }
};
|
||
|
|
|
||
|
|
/**
 * GET handler: looks up a single document by primary key.
 *
 * Responds with the document when found, 404 when `Document.findByPk`
 * returns nothing, and 500 when the lookup throws.
 *
 * @param {object} req - Express request; reads `req.params.documentId`.
 * @param {object} res - Express response.
 * @returns {Promise<*>}
 */
const getDocument = async (req, res) => {
  try {
    const found = await Document.findByPk(req.params.documentId);

    if (found) {
      res.json({
        success: true,
        data: { document: found }
      });
      return;
    }

    // Nothing stored under that id.
    return res.status(404).json({
      success: false,
      error: 'Document not found'
    });
  } catch (err) {
    logger.error('Get document error:', err);
    res.status(500).json({
      success: false,
      error: 'Internal server error'
    });
  }
};
|
||
|
|
|
||
|
|
/**
 * GET handler: semantic search over indexed documents.
 *
 * Embeds `query` via `embeddingService.embedText`, loads every document with
 * `is_indexed: true` (optionally restricted to `category`), scores each one
 * with `embeddingService.cosineSimilarity` and returns the highest-scoring
 * entries, best first. Each result carries a snippet made of the first 300
 * characters of the extracted text.
 *
 * Query parameters:
 *  - `query`    (required) search text; 400 when missing.
 *  - `category` optional exact category filter.
 *  - `limit`    maximum number of results (default 10). Values that are not
 *               positive integers fall back to the default rather than
 *               silently returning an empty or tail-truncated list.
 *
 * Responds 500 on any error.
 *
 * @param {object} req - Express request; reads `req.query`.
 * @param {object} res - Express response.
 * @returns {Promise<*>}
 */
const searchDocuments = async (req, res) => {
  try {
    const { query, category, limit = 10 } = req.query;

    if (!query) {
      return res.status(400).json({
        success: false,
        error: 'Search query is required'
      });
    }

    const parsedLimit = Number.parseInt(limit, 10);
    const maxResults = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : 10;

    const whereClause = {
      is_indexed: true,
      ...(category ? { category } : {})
    };

    // Embed query and compute cosine similarity in JS for now
    const queryEmbedding = await embeddingService.embedText(query);

    const candidates = await Document.findAll({
      where: whereClause,
      attributes: ['id', 'original_filename', 'extracted_text', 'embeddings', 'category', 'created_at']
    });

    const scored = candidates.map((doc) => ({
      id: doc.id,
      original_filename: doc.original_filename,
      snippet: (doc.extracted_text || '').slice(0, 300),
      category: doc.category,
      created_at: doc.created_at,
      score: embeddingService.cosineSimilarity(queryEmbedding, doc.embeddings || [])
    }));

    // Highest similarity first.
    scored.sort((a, b) => b.score - a.score);
    const top = scored.slice(0, maxResults);

    res.json({ success: true, data: { results: top } });
  } catch (error) {
    logger.error('Search documents error:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error'
    });
  }
};
|
||
|
|
|
||
|
|
/**
 * GET handler: delegates a search to `graphRagService.graphSearch`.
 *
 * Requires the `query` query-string parameter (400 when absent) and forwards
 * it together with the optional `category`. Whatever the service resolves
 * with is returned as `data`. Responds 500 when the service throws.
 *
 * @param {object} req - Express request; reads `req.query`.
 * @param {object} res - Express response.
 * @returns {Promise<*>}
 */
const graphSearchDocuments = async (req, res) => {
  try {
    const { query: searchQuery, category } = req.query;

    if (!searchQuery) {
      return res.status(400).json({ success: false, error: 'Search query is required' });
    }

    const data = await graphRagService.graphSearch({ query: searchQuery, category });
    res.json({ success: true, data });
  } catch (err) {
    logger.error('Graph search error:', err);
    res.status(500).json({ success: false, error: 'Internal server error' });
  }
};
|
||
|
|
|
||
|
|
/**
 * DELETE handler: removes a document record and its file on disk.
 *
 * The database row is destroyed first and the file is removed afterwards. If
 * the row cannot be deleted the file is left untouched, so a surviving record
 * never points at a file that has already been removed. File removal is
 * best-effort: a file that is already gone is ignored, and any other unlink
 * failure is logged without failing the request.
 *
 * Responds 404 when the document does not exist and 500 on unexpected errors.
 *
 * @param {object} req - Express request; reads `req.params.documentId`.
 * @param {object} res - Express response.
 * @returns {Promise<*>}
 */
const deleteDocument = async (req, res) => {
  try {
    const { documentId } = req.params;

    const document = await Document.findByPk(documentId);

    if (!document) {
      return res.status(404).json({
        success: false,
        error: 'Document not found'
      });
    }

    // Capture the path before the record is destroyed.
    const filePath = document.file_path;

    // Delete database record
    await document.destroy();

    // Delete physical file
    if (filePath) {
      try {
        await fs.promises.unlink(filePath);
      } catch (unlinkError) {
        // ENOENT means the file was already gone, which is the desired state.
        if (unlinkError.code !== 'ENOENT') {
          logger.error('Delete document file error:', unlinkError);
        }
      }
    }

    logger.info(`Document deleted: ${documentId}`);

    res.json({
      success: true,
      message: 'Document deleted successfully'
    });
  } catch (error) {
    logger.error('Delete document error:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error'
    });
  }
};
|
||
|
|
|
||
|
|
module.exports = {
|
||
|
|
uploadDocument,
|
||
|
|
getDocuments,
|
||
|
|
getDocument,
|
||
|
|
searchDocuments,
|
||
|
|
graphSearchDocuments,
|
||
|
|
deleteDocument,
|
||
|
|
upload // Export multer middleware
|
||
|
|
};
|