import { createFlow, parse, chunk, extract, combine } from '@doclo/flows';
import { createVLMProvider } from '@doclo/providers-llm';
import { createOCRProvider } from '@doclo/providers-datalab';
import { readFileSync } from 'fs';
// Types
/**
 * A single clause as returned by the extraction step.
 *
 * `title` and `pageNumber` are optional on purpose: `clauseSchema` only lists
 * `clauseNumber`, `content` and `type` under `required`, so the other two
 * fields may be missing from the extracted data and must be checked before use.
 */
interface ContractClause {
  /** Clause number as it appears in the document (e.g. "1.1", "2.3"). */
  clauseNumber: string;
  /** Clause title or heading; not required by the schema. */
  title?: string;
  /** Full clause text. */
  content: string;
  /** Page where the clause appears; not required by the schema. */
  pageNumber?: number;
  /** Classification of the clause. */
  type: 'obligation' | 'right' | 'definition' | 'general';
}
/**
 * Result shape passed as the type argument to `extract<…>()`:
 * an object holding the list of clauses found.
 */
interface ContractExtraction {
  /** Extracted clauses. */
  clauses: Array<ContractClause>;
}
// Schema
/**
 * JSON-schema-style description of one extracted clause.
 * Only `clauseNumber`, `content` and `type` are listed as required.
 */
const clauseItemSchema = {
  type: 'object',
  properties: {
    clauseNumber: { type: 'string', description: 'Clause number (e.g., 1.1, 2.3)' },
    title: { type: 'string', description: 'Clause title or heading' },
    content: { type: 'string', description: 'Full clause text' },
    pageNumber: { type: 'number', description: 'Page where clause appears' },
    type: {
      type: 'string',
      enum: ['obligation', 'right', 'definition', 'general'],
      description: 'Clause type classification',
    },
  },
  required: ['clauseNumber', 'content', 'type'],
};

/**
 * Schema handed to the extraction step: an object whose `clauses` property is
 * an array of clause items.
 */
const clauseSchema = {
  type: 'object',
  properties: {
    clauses: {
      type: 'array',
      items: clauseItemSchema,
    },
  },
};
// Providers
// OCR provider used by the flow's `parse` step.
// The `!` only silences the type checker: if DATALAB_API_KEY is unset,
// `undefined` is passed through as the key.
const datalabApiKey = process.env.DATALAB_API_KEY!;
const ocrProvider = createOCRProvider({
  endpoint: 'https://www.datalab.to/api/v1/marker',
  apiKey: datalabApiKey,
});
// Model provider used by the flow's `extract` step.
// The `!` only silences the type checker: if OPENROUTER_API_KEY is unset,
// `undefined` is passed through as the key.
const openRouterApiKey = process.env.OPENROUTER_API_KEY!;
const vlmProvider = createVLMProvider({
  provider: 'anthropic',
  model: 'anthropic/claude-sonnet-4.5',
  apiKey: openRouterApiKey,
  via: 'openrouter',
});
// Build flow
/**
 * Contract-analysis flow, built once at module load.
 *
 * Steps, in order: `parse` (with the OCR provider), `chunk` (by section),
 * a `forEach` that runs an `extract` sub-flow with the clause schema, and a
 * final `combine` using the 'merge' strategy. Batch progress is written to the
 * console through the observability callbacks.
 */
const contractFlow = createFlow({
  observability: {
    // Progress logging for the batch phase.
    onBatchStart: ({ totalItems }) => {
      console.log(`Processing ${totalItems} chunks...`);
    },
    onBatchItemEnd: ({ itemIndex }) => {
      // itemIndex is shifted by one so the first message reads "Chunk 1".
      console.log(`Chunk ${itemIndex + 1} complete`);
    },
    onBatchEnd: ({ successfulItems, failedItems }) => {
      console.log(`All chunks processed: ${successfulItems} success, ${failedItems} failed`);
    },
  },
})
  .step('parse', parse({ provider: ocrProvider }))
  .step(
    'chunk',
    chunk({
      // Split by sections/headers.
      strategy: 'section',
      // NOTE(review): units of maxSize/minSize are not visible here — confirm.
      maxSize: 3000,
      minSize: 200,
    }),
  )
  .forEach('extract', () => {
    // A fresh sub-flow is created on every invocation of this factory.
    return createFlow().step(
      'extract',
      extract<ContractExtraction>({
        provider: vlmProvider,
        schema: clauseSchema,
        inputMode: 'ir',
        // The prompt text is kept flush-left so the string is unchanged.
        additionalInstructions: `
Extract all contract clauses from this section.
Include the exact page number where each clause starts.
Classify each clause by its primary purpose.
`,
      }),
    );
  })
  .step('combine', combine({ strategy: 'merge' }))
  .build();
// Process contract
/**
 * Counts how many clauses carry each `type` value.
 *
 * The tally is kept in a `Map` rather than a plain object because the type
 * strings originate from model output: a value such as "constructor" or
 * "toString" would otherwise read an inherited `Object.prototype` member
 * instead of starting from zero.
 *
 * @param clauses - Clauses to tally; `undefined`/`null` is treated as empty.
 * @returns A plain object mapping each seen type to its count (empty if none).
 */
function countClausesByType(
  clauses: ReadonlyArray<{ type: string }> | null | undefined,
): Record<string, number> {
  const counts = new Map<string, number>();
  for (const { type } of clauses ?? []) {
    counts.set(type, (counts.get(type) ?? 0) + 1);
  }
  // Object.fromEntries defines own properties, so unusual keys stay plain data.
  return Object.fromEntries(counts);
}

/**
 * Runs the contract flow on a PDF file and logs a summary to the console.
 *
 * The file is read synchronously, wrapped in a `data:application/pdf;base64,`
 * URI and passed to `contractFlow.run`. The summary contains the clause
 * total, a per-type count, and the aggregated duration and cost.
 *
 * @param filePath - Path of the PDF to analyse.
 * @returns The flow's `output` value.
 * @throws Whatever `readFileSync` or `contractFlow.run` throws/rejects with.
 */
async function processContract(filePath: string) {
  const fileBuffer = readFileSync(filePath);
  const base64 = `data:application/pdf;base64,${fileBuffer.toString('base64')}`;
  const result = await contractFlow.run({ base64 });

  const clauses = result.output.clauses;

  console.log('\n--- Contract Analysis ---');
  console.log('Total clauses found:', clauses?.length ?? 0);

  // Group by type; a missing clause list yields {} so it agrees with the 0 above.
  const byType = countClausesByType(clauses);
  console.log('By type:', byType);

  console.log('\n--- Metrics ---');
  console.log('Duration:', result.aggregated.totalDurationMs, 'ms');
  console.log('Cost: $', result.aggregated.totalCostUSD.toFixed(4));

  return result.output;
}
// Script entry point: analyse ./contract.pdf and print any rejection to stderr.
processContract('./contract.pdf').catch((error) => {
  console.error(error);
});