- id: setup_batch_processing
  # Runs a Python snippet that declares the batch configuration (document
  # types, per-type extraction fields, validation rules) and prints the
  # configured document types.
  # NOTE(review): no other step visible here references this step's output;
  # confirm whether batch_config is meant to be consumed downstream.
  name: setup_batch_processing
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: code
      value: |
        # Define batch processing configuration
        batch_config = {
            "document_types": ["invoice", "receipt", "contract"],
            "extraction_schema": {
                "common_fields": ["date", "amount", "vendor", "document_type"],
                "invoice_fields": ["invoice_number", "line_items", "tax_amount"],
                "receipt_fields": ["merchant", "payment_method", "receipt_number"],
                "contract_fields": ["parties", "terms", "effective_date", "expiration_date"]
            },
            "validation_rules": {
                "amount_validation": {"type": "number", "min": 0},
                "date_validation": {"type": "date", "format": "flexible"},
                # Python boolean literal: the JSON spelling `false` is an
                # undefined name in Python and raises NameError at runtime.
                "email_validation": {"type": "email", "required": False}
            }
        }
        # Echo the configured document types, one per line, title-cased.
        print("Batch processing configured for document types:")
        for doc_type in batch_config["document_types"]:
            print(f" - {doc_type.title()}")
- id: process_document_batch
  # Hands the batch file URL and a draft-07 JSON Schema to the
  # JINBA_MODULES_EXTRACT tool. The schema describes an object holding a
  # "documents" array whose items require document_type, date and amount.
  name: process_document_batch
  tool: JINBA_MODULES_EXTRACT
  input:
    - name: task_name
      value: 'Batch Document Processing'
    - name: file_url
      value: '{{input.batch_file_url}}'
    - name: data_schema
      value: |
        {
          "$schema": "http://json-schema.org/draft-07/schema#",
          "type": "object",
          "properties": {
            "documents": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "document_type": {"type": "string"},
                  "date": {"type": "string"},
                  "amount": {"type": "number"},
                  "vendor": {"type": "string"},
                  "metadata": {
                    "type": "object",
                    "additionalProperties": true
                  }
                },
                "required": ["document_type", "date", "amount"]
              }
            }
          }
        }
    - name: extraction_mode
      value: 'BALANCED'
- id: parse_complex_structures
  # Feeds the extraction step's extracted_data into JINBA_MODULES_PARSE
  # together with a JSON object of parsing option flags (all set to true).
  name: parse_complex_structures
  tool: JINBA_MODULES_PARSE
  input:
    - name: input_data
      value: '{{steps.process_document_batch.result.extracted_data}}'
    - name: parsing_options
      value: |
        {
          "preserve_structure": true,
          "normalize_dates": true,
          "standardize_amounts": true,
          "extract_entities": true,
          "group_by_type": true
        }
- id: comprehensive_validation
  # Sends the parse step's parsed_data to JINBA_MODULES_CHECKER_V2 with a
  # JSON rule set: four per-field rules (document_type, amount, date, vendor)
  # and two summary rules (document count, total amount).
  name: comprehensive_validation
  tool: JINBA_MODULES_CHECKER_V2
  input:
    - name: data_content
      value: '{{steps.parse_complex_structures.result.parsed_data}}'
    - name: rules_json
      value: |
        {
          "validation_rules": [
            {
              "field": "documents[*].document_type",
              "type": "enum",
              "values": ["invoice", "receipt", "contract"],
              "error_message": "Document type must be invoice, receipt, or contract"
            },
            {
              "field": "documents[*].amount",
              "type": "number",
              "min": 0,
              "max": 1000000,
              "error_message": "Amount must be between 0 and 1,000,000"
            },
            {
              "field": "documents[*].date",
              "type": "date",
              "min_date": "2020-01-01",
              "max_date": "2025-12-31",
              "error_message": "Date must be between 2020 and 2025"
            },
            {
              "field": "documents[*].vendor",
              "type": "string",
              "min_length": 2,
              "max_length": 200,
              "error_message": "Vendor name must be 2-200 characters"
            }
          ],
          "summary_rules": [
            {
              "rule": "document_count_check",
              "expression": "documents.length > 0",
              "error_message": "At least one document must be processed"
            },
            {
              "rule": "total_amount_check",
              "expression": "sum(documents[*].amount) > 0",
              "error_message": "Total amount must be greater than zero"
            }
          ]
        }
- id: generate_processing_report
  # Runs a Python snippet that loads the JSON outputs of the extract, parse
  # and validation steps, builds a summary report (document counts per type,
  # total amount, validation pass/fail tallies, recommendations) and prints
  # it to stdout as indented JSON.
  name: generate_processing_report
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: code
      value: |
        import json
        from datetime import datetime
        # Compile processing report
        # The step outputs are spliced into the literals below as JSON text.
        # Raw literals (r'''...''') keep JSON backslash escapes such as \" \n
        # and \\ intact; a non-raw literal lets Python consume them first and
        # hands json.loads corrupted text. A payload containing three
        # consecutive single quotes would still terminate the literal early.
        extracted = json.loads(r'''{{steps.process_document_batch.result.extracted_data}}''')
        # Not used by the report below; kept so a malformed parse result still
        # fails this step and the reference to the parse step is preserved.
        parsed = json.loads(r'''{{steps.parse_complex_structures.result.parsed_data}}''')
        # assumes a list of dicts each carrying a 'status' key - TODO confirm
        # against the JINBA_MODULES_CHECKER_V2 output format
        validation = json.loads(r'''{{steps.comprehensive_validation.result.validation_results}}''')
        # Treat a missing or null "documents" entry as an empty list.
        documents = extracted.get('documents') or []
        report = {
            "processing_summary": {
                "timestamp": datetime.now().isoformat(),
                "total_documents": len(documents),
                "extraction_mode": "BALANCED",
                "validation_passed": all(r.get('status') == 'valid' for r in validation)
            },
            "document_breakdown": {},
            "validation_summary": {
                "total_rules": len(validation),
                "passed": sum(1 for r in validation if r.get('status') == 'valid'),
                "failed": sum(1 for r in validation if r.get('status') == 'invalid')
            },
            "recommendations": []
        }
        # Document type breakdown
        if 'documents' in extracted:
            doc_types = {}
            total_amount = 0
            for doc in documents:
                doc_type = doc.get('document_type', 'unknown')
                doc_types[doc_type] = doc_types.get(doc_type, 0) + 1
                # `or 0` also covers an explicit JSON null, which
                # .get('amount', 0) would return as None and crash the sum.
                total_amount += doc.get('amount') or 0
            report['document_breakdown'] = doc_types
            report['processing_summary']['total_amount'] = total_amount
        # Add recommendations
        if report['validation_summary']['failed'] > 0:
            report['recommendations'].append("Review failed validations and correct data issues")
        if report['processing_summary']['total_documents'] > 100:
            report['recommendations'].append("Consider processing in smaller batches for better performance")
        print(json.dumps(report, indent=2))
- id: save_processing_report
  # Passes the stdout of generate_processing_report to the OUTPUT_FILE tool
  # as JSON content, with a filename built from a date template expression.
  name: save_processing_report
  tool: OUTPUT_FILE
  input:
    - name: content
      value: '{{steps.generate_processing_report.result.stdout}}'
    - name: filename
      # Double-quoted because the template expression contains single quotes.
      value: "batch_processing_report_{{date | format('YYYY-MM-DD-HHmm')}}.json"
    - name: fileType
      value: 'json'