Update source-tracker.ts

This commit is contained in:
Nicolas
2025-01-28 15:20:22 -03:00
parent 04c6f511b5
commit 70562261bc
@@ -3,7 +3,7 @@ import { areMergeable } from "./merge-null-val-objs";
import { transformArrayToObject } from "./transform-array-to-obj"; import { transformArrayToObject } from "./transform-array-to-obj";
interface TransformedResult { interface TransformedResult {
transformed: { [key: string]: any[] }; transformed: { [key: string]: any[] } | any[];
url: string; url: string;
} }
@@ -23,56 +23,70 @@ export class SourceTracker {
* Transform raw extraction results into a format that preserves source information * Transform raw extraction results into a format that preserves source information
*/ */
transformResults(extractionResults: { extract: any; url: string }[], schema: any, withTransform: boolean = true) { transformResults(extractionResults: { extract: any; url: string }[], schema: any, withTransform: boolean = true) {
// First transform each result individually // Handle array outputs
if (Array.isArray(extractionResults[0]?.extract)) {
this.transformedResults = extractionResults.map(result => ({
transformed: result.extract,
url: result.url
}));
if (withTransform) {
// Combine all extracts to match original behavior
const combinedExtracts = extractionResults.map(r => r.extract).flat();
return combinedExtracts;
}
return this.transformedResults;
}
// Handle object outputs (original behavior)
this.transformedResults = extractionResults.map(result => ({ this.transformedResults = extractionResults.map(result => ({
transformed: transformArrayToObject(schema, [result.extract]), transformed: transformArrayToObject(schema, [result.extract]),
url: result.url url: result.url
})); }));
if (withTransform) { if (withTransform) {
// Then combine all extracts and transform them together to match original behavior // Then combine all extracts and transform them together to match original behavior
const combinedExtracts = extractionResults.map(r => r.extract); const combinedExtracts = extractionResults.map(r => r.extract);
return transformArrayToObject(schema, combinedExtracts); return transformArrayToObject(schema, combinedExtracts);
} }
return this.transformedResults; return this.transformedResults;
} }
/**
* Merge all transformed results into one object - this is now only used internally
*/
private mergeTransformedResults() {
return this.transformedResults.reduce((acc, curr) => {
Object.keys(curr.transformed).forEach(key => {
const value = curr.transformed[key];
if (!acc[key]) {
acc[key] = Array.isArray(value) ? [...value] : value;
} else if (Array.isArray(acc[key]) && Array.isArray(value)) {
acc[key].push(...value);
} else if (typeof acc[key] === 'object' && typeof value === 'object') {
acc[key] = { ...acc[key], ...value };
}
});
return acc;
}, {} as { [key: string]: any[] });
}
/** /**
* Track sources for each item before deduplication * Track sources for each item before deduplication
*/ */
trackPreDeduplicationSources(multiEntityResult: { [key: string]: any[] }) { trackPreDeduplicationSources(multiEntityResult: { [key: string]: any[] } | any[]) {
try { try {
Object.keys(multiEntityResult).forEach(key => { if (Array.isArray(multiEntityResult)) {
multiEntityResult[key].forEach((item: any) => { // Handle array outputs
const itemKey = JSON.stringify(item); multiEntityResult.forEach((item: any) => {
const matchingSources = this.transformedResults const itemKey = JSON.stringify(item);
.filter(result => const matchingSources = this.transformedResults
result.transformed[key]?.some((resultItem: any) => .filter(result =>
JSON.stringify(resultItem) === itemKey Array.isArray(result.transformed) &&
result.transformed.some((resultItem: any) =>
JSON.stringify(resultItem) === itemKey
)
) )
) .map(result => result.url);
.map(result => result.url); this.preDedupeSourceMap.set(itemKey, matchingSources);
this.preDedupeSourceMap.set(itemKey, matchingSources); });
}); } else {
}); // Handle object outputs (original behavior)
Object.keys(multiEntityResult).forEach(key => {
multiEntityResult[key].forEach((item: any) => {
const itemKey = JSON.stringify(item);
const matchingSources = this.transformedResults
.filter(result =>
result.transformed[key]?.some((resultItem: any) =>
JSON.stringify(resultItem) === itemKey
)
)
.map(result => result.url);
this.preDedupeSourceMap.set(itemKey, matchingSources);
});
});
}
} catch (error) { } catch (error) {
logger.error(`Failed to track pre-deduplication sources`, { error }); logger.error(`Failed to track pre-deduplication sources`, { error });
} }
@@ -82,35 +96,56 @@ export class SourceTracker {
* Map sources to final deduplicated/merged items * Map sources to final deduplicated/merged items
*/ */
mapSourcesToFinalItems( mapSourcesToFinalItems(
multiEntityResult: { [key: string]: any[] }, multiEntityResult: { [key: string]: any[] } | any[],
multiEntityKeys: string[] multiEntityKeys: string[]
): Record<string, string[]> { ): Record<string, string[]> {
try { try {
const sources: Record<string, string[]> = {}; const sources: Record<string, string[]> = {};
multiEntityKeys.forEach(key => { if (Array.isArray(multiEntityResult)) {
if (multiEntityResult[key] && Array.isArray(multiEntityResult[key])) { // Handle array outputs
multiEntityResult[key].forEach((item: any, finalIndex: number) => { multiEntityResult.forEach((item: any, finalIndex: number) => {
const sourceKey = `${key}[${finalIndex}]`; const sourceKey = `[${finalIndex}]`;
const itemSources = new Set<string>(); const itemSources = new Set<string>();
this.transformedResults.forEach(result => { this.transformedResults.forEach(result => {
result.transformed[key]?.forEach((originalItem: any) => { if (Array.isArray(result.transformed)) {
if (areMergeable(item, originalItem)) { result.transformed.forEach((originalItem: any) => {
itemSources.add(result.url); if (areMergeable(item, originalItem)) {
} itemSources.add(result.url);
}); }
});
}
}); });
sources[sourceKey] = Array.from(itemSources); sources[sourceKey] = Array.from(itemSources);
}); });
} } else {
}); // Handle object outputs (original behavior)
multiEntityKeys.forEach(key => {
if (multiEntityResult[key] && Array.isArray(multiEntityResult[key])) {
multiEntityResult[key].forEach((item: any, finalIndex: number) => {
const sourceKey = `${key}[${finalIndex}]`;
const itemSources = new Set<string>();
return sources; this.transformedResults.forEach(result => {
} catch (error) { result.transformed[key]?.forEach((originalItem: any) => {
logger.error(`Failed to map sources to final items`, { error }); if (areMergeable(item, originalItem)) {
return {}; itemSources.add(result.url);
}
});
});
sources[sourceKey] = Array.from(itemSources);
});
}
});
}
return sources;
} catch (error) {
logger.error(`Failed to map sources to final items`, { error });
return {};
}
} }
}
} }