Jinbaモジュールを使用した高度なデータ抽出、解析、検証
# Step: extract structured invoice fields from the uploaded document.
- id: extract_structured_data
  name: extract_structured_data
  tool: JINBA_MODULES_EXTRACT
  input:
    - name: task_name
      value: '請求書データ抽出'
    # The document location is taken from the upload_document step's result.
    - name: file_url
      value: '{{steps.upload_document.result.file_url}}'
    # JSON Schema (draft-07) listing the fields to extract; only
    # invoice_number, date and total_amount are required.
    - name: data_schema
      value: |
        {
          "$schema": "http://json-schema.org/draft-07/schema#",
          "type": "object",
          "properties": {
            "invoice_number": {
              "type": "string",
              "description": "請求書番号またはID"
            },
            "date": {
              "type": "string",
              "format": "date",
              "description": "請求書日付"
            },
            "vendor": {
              "type": "object",
              "properties": {
                "name": {"type": "string"},
                "address": {"type": "string"},
                "phone": {"type": "string"},
                "email": {"type": "string"}
              }
            },
            "items": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "description": {"type": "string"},
                  "quantity": {"type": "number"},
                  "unit_price": {"type": "number"},
                  "total": {"type": "number"}
                }
              }
            },
            "total_amount": {
              "type": "number",
              "description": "合計請求金額"
            },
            "tax_amount": {
              "type": "number",
              "description": "税額(存在する場合)"
            }
          },
          "required": ["invoice_number", "date", "total_amount"]
        }
    # Options: FAST, BALANCED, QUALITY
    - name: extraction_mode
      value: 'QUALITY'
# Step: run per-field validation rules over the extraction output.
- id: validate_extracted_data
  name: validate_extracted_data
  tool: JINBA_MODULES_CHECKER_V2
  input:
    # Input file is the file_url exposed by the extract_structured_data result.
    - name: file_url
      value: '{{steps.extract_structured_data.result.file_url}}'
    # Rule set: one entry per field. The total_amount message states
    # "0 or greater" so that it agrees with the "min": 0 bound (zero passes).
    - name: rules_json
      value: |
        {
          "validation_rules": [
            {
              "field": "invoice_number",
              "type": "required",
              "error_message": "請求書番号は必須です"
            },
            {
              "field": "total_amount",
              "type": "number",
              "min": 0,
              "error_message": "合計金額は0以上の数である必要があります"
            },
            {
              "field": "date",
              "type": "date",
              "format": "YYYY-MM-DD",
              "error_message": "日付は有効な形式である必要があります"
            },
            {
              "field": "vendor.email",
              "type": "email",
              "required": false,
              "error_message": "提供された場合、メールは有効な形式である必要があります"
            }
          ]
        }
# Step: print a human-readable summary of the extraction and validation
# results produced by the two preceding steps.
- id: process_extraction_results
  name: process_extraction_results
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: code
      value: |
        import json

        # The templated JSON is pasted into the source text, so it must sit in
        # RAW string literals: in a normal literal Python would process the
        # JSON backslash escapes first and hand json.loads a corrupted payload.
        # NOTE: a payload that itself contains three consecutive single quotes
        # would still terminate the literal early.
        extracted_data = json.loads(r'''{{steps.extract_structured_data.result.extracted_data}}''')
        validation_results = json.loads(r'''{{steps.validate_extracted_data.result.validation_results}}''')

        print("文書抽出結果")
        print("=" * 20)

        # Show the extracted fields.
        print("📄 抽出された情報:")
        print(f"請求書番号: {extracted_data.get('invoice_number', 'N/A')}")
        print(f"日付: {extracted_data.get('date', 'N/A')}")

        # vendor / total_amount / items may be absent or JSON null; fall back
        # to neutral defaults instead of failing on None.
        vendor = extracted_data.get('vendor') or {}
        print(f"ベンダー: {vendor.get('name', 'N/A')}")
        total_amount = extracted_data.get('total_amount') or 0
        print(f"合計金額: ¥{total_amount:,.2f}")
        items = extracted_data.get('items')
        if items is not None:
            print(f"アイテム数: {len(items)}")

        # Validation overview; assumes validation_results is a list of dicts
        # carrying a 'status' key -- TODO confirm against the checker output.
        print("\n🔍 検証結果:")
        valid_count = sum(1 for r in validation_results if r.get('status') == 'valid')
        total_rules = len(validation_results)
        print(f"有効: {valid_count}/{total_rules}")

        # List validation errors, if any.
        errors = [r for r in validation_results if r.get('status') == 'invalid']
        if errors:
            print("\n❌ 検証エラー:")
            for error in errors:
                print(f" - {error.get('field', '不明')}: {error.get('message', '不明なエラー')}")
        else:
            print("✅ すべての検証に合格しました")
# Step: write the extracted invoice data to a dated JSON output file.
- id: export_processed_data
  name: export_processed_data
  tool: OUTPUT_FILE
  input:
    # File body is the extracted_data value of the extraction step.
    - name: content
      value: '{{steps.extract_structured_data.result.extracted_data}}'
    # Double quotes are kept here because the template contains single quotes.
    - name: filename
      value: "extracted_invoice_data_{{date | format('YYYY-MM-DD')}}.json"
    - name: fileType
      value: 'json'
# Step: define the batch-processing configuration and print the supported
# document types.
- id: setup_batch_processing
  name: setup_batch_processing
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: code
      value: |
        # Define the batch-processing configuration.
        batch_config = {
            "document_types": ["invoice", "receipt", "contract"],
            "extraction_schema": {
                "common_fields": ["date", "amount", "vendor", "document_type"],
                "invoice_fields": ["invoice_number", "line_items", "tax_amount"],
                "receipt_fields": ["merchant", "payment_method", "receipt_number"],
                "contract_fields": ["parties", "terms", "effective_date", "expiration_date"]
            },
            "validation_rules": {
                "amount_validation": {"type": "number", "min": 0},
                "date_validation": {"type": "date", "format": "flexible"},
                # This is Python, not JSON: the boolean literal is False.
                # A lowercase JSON-style literal is an undefined name here.
                "email_validation": {"type": "email", "required": False}
            }
        }

        print("バッチ処理設定が完了しました。文書タイプ:")
        for doc_type in batch_config["document_types"]:
            print(f" - {doc_type.title()}")
# Step: extract a list of documents from the batch file supplied as input.
- id: process_document_batch
  name: process_document_batch
  tool: JINBA_MODULES_EXTRACT
  input:
    - name: task_name
      value: 'バッチ文書処理'
    # The batch file location comes from the flow input batch_file_url.
    - name: file_url
      value: '{{input.batch_file_url}}'
    # JSON Schema (draft-07): a "documents" array whose entries must carry
    # document_type, date and amount; metadata accepts arbitrary keys.
    - name: data_schema
      value: |
        {
          "$schema": "http://json-schema.org/draft-07/schema#",
          "type": "object",
          "properties": {
            "documents": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "document_type": {"type": "string"},
                  "date": {"type": "string"},
                  "amount": {"type": "number"},
                  "vendor": {"type": "string"},
                  "metadata": {
                    "type": "object",
                    "additionalProperties": true
                  }
                },
                "required": ["document_type", "date", "amount"]
              }
            }
          }
        }
    - name: extraction_mode
      value: 'BALANCED'
# Step: pass the batch extraction output through the parse module.
- id: parse_complex_structures
  name: parse_complex_structures
  tool: JINBA_MODULES_PARSE
  input:
    # Input is the extracted_data value of process_document_batch.
    - name: input_data
      value: '{{steps.process_document_batch.result.extracted_data}}'
    # All parsing options are switched on.
    - name: parsing_options
      value: |
        {
          "preserve_structure": true,
          "normalize_dates": true,
          "standardize_amounts": true,
          "extract_entities": true,
          "group_by_type": true
        }
# Step: validate every parsed document and apply batch-level summary checks.
- id: comprehensive_validation
  name: comprehensive_validation
  tool: JINBA_MODULES_CHECKER_V2
  input:
    # Data under test is the parsed_data value of parse_complex_structures.
    - name: data_content
      value: '{{steps.parse_complex_structures.result.parsed_data}}'
    # validation_rules apply to each entry of documents[*];
    # summary_rules are expressions over the batch as a whole.
    # NOTE(review): the accepted date window (2020-01-01 .. 2025-12-31) is
    # hard-coded in both the rule and its message -- revisit when later-dated
    # documents are expected.
    - name: rules_json
      value: |
        {
          "validation_rules": [
            {
              "field": "documents[*].document_type",
              "type": "enum",
              "values": ["invoice", "receipt", "contract"],
              "error_message": "文書タイプは invoice、receipt、contract のいずれかである必要があります"
            },
            {
              "field": "documents[*].amount",
              "type": "number",
              "min": 0,
              "max": 1000000,
              "error_message": "金額は0から1,000,000の間である必要があります"
            },
            {
              "field": "documents[*].date",
              "type": "date",
              "min_date": "2020-01-01",
              "max_date": "2025-12-31",
              "error_message": "日付は2020年から2025年の間である必要があります"
            },
            {
              "field": "documents[*].vendor",
              "type": "string",
              "min_length": 2,
              "max_length": 200,
              "error_message": "ベンダー名は2-200文字である必要があります"
            }
          ],
          "summary_rules": [
            {
              "rule": "document_count_check",
              "expression": "documents.length > 0",
              "error_message": "少なくとも1つの文書が処理される必要があります"
            },
            {
              "rule": "total_amount_check",
              "expression": "sum(documents[*].amount) > 0",
              "error_message": "合計金額はゼロより大きい必要があります"
            }
          ]
        }
# Step: build a JSON processing report (summary, per-type breakdown,
# validation totals, recommendations) and print it to stdout.
- id: generate_processing_report
  name: generate_processing_report
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: code
      value: |
        import json
        from datetime import datetime

        # The templated JSON is pasted into the source text, so it must sit in
        # RAW string literals: in a normal literal Python would process the
        # JSON backslash escapes first and hand json.loads a corrupted payload.
        # NOTE: a payload that itself contains three consecutive single quotes
        # would still terminate the literal early.
        extracted = json.loads(r'''{{steps.process_document_batch.result.extracted_data}}''')
        validation = json.loads(r'''{{steps.comprehensive_validation.result.validation_results}}''')

        # A missing or null "documents" entry is treated as an empty batch.
        documents = extracted.get('documents') or []

        # Assumes validation is a list of dicts carrying a 'status' key --
        # TODO confirm against the checker output.
        passed = sum(1 for r in validation if r.get('status') == 'valid')
        failed = sum(1 for r in validation if r.get('status') == 'invalid')

        # Assemble the processing report.
        report = {
            "processing_summary": {
                "timestamp": datetime.now().isoformat(),
                "total_documents": len(documents),
                "extraction_mode": "BALANCED",
                "validation_passed": all(r.get('status') == 'valid' for r in validation)
            },
            "document_breakdown": {},
            "validation_summary": {
                "total_rules": len(validation),
                "passed": passed,
                "failed": failed
            },
            "recommendations": []
        }

        # Breakdown by document type plus the summed amount.
        if 'documents' in extracted:
            doc_types = {}
            total_amount = 0
            for doc in documents:
                doc_type = doc.get('document_type', 'unknown')
                doc_types[doc_type] = doc_types.get(doc_type, 0) + 1
                # A null amount counts as 0 rather than raising a TypeError.
                total_amount += doc.get('amount') or 0
            report['document_breakdown'] = doc_types
            report['processing_summary']['total_amount'] = total_amount

        # Add recommendations.
        if failed > 0:
            report['recommendations'].append("失敗した検証を確認し、データの問題を修正してください")
        if len(documents) > 100:
            report['recommendations'].append("パフォーマンス向上のため、より小さなバッチでの処理を検討してください")

        print(json.dumps(report, indent=2, ensure_ascii=False))
# Step: save the report printed by generate_processing_report as a
# timestamped JSON output file.
- id: save_processing_report
  name: save_processing_report
  tool: OUTPUT_FILE
  input:
    # File body is the stdout captured from the report-generation step.
    - name: content
      value: '{{steps.generate_processing_report.result.stdout}}'
    # Double quotes are kept here because the template contains single quotes.
    - name: filename
      value: "batch_processing_report_{{date | format('YYYY-MM-DD-HHmm')}}.json"
    - name: fileType
      value: 'json'
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"field_name": {
"type": "string|number|object|array",
"description": "フィールドの明確な説明",
"format": "date|email|uri|etc",
"pattern": "必要に応じて正規表現パターン"
}
},
"required": ["必須フィールドのリスト"]
}