English
Extract and process text from various document formats
PDF_EXTRACT_TEXT
DOCX_EXTRACT_TEXT
WORD_TABLE_EXTRACT
WORD_TABLE_UPDATE
# Extract plain text from an uploaded PDF.
# NOTE(review): assumes a step with id `upload_pdf` exposing `result.base64`
# is defined elsewhere in the workflow - confirm.
- id: extract_pdf_text
  name: extract_pdf_text
  tool: PDF_EXTRACT_TEXT
  input:
    - name: base64_file
      value: "{{steps.upload_pdf.result.base64}}"

# Extract plain text from an uploaded DOCX.
# NOTE(review): assumes a step with id `upload_docx` exposing `result.base64`
# is defined elsewhere in the workflow - confirm.
- id: extract_docx_text
  name: extract_docx_text
  tool: DOCX_EXTRACT_TEXT
  input:
    - name: base64_file
      value: "{{steps.upload_docx.result.base64}}"

# Send both extracted texts to the model in a single prompt and ask for a
# summary, key findings, and action items.
- id: analyze_extracted_text
  name: analyze_extracted_text
  tool: OPENAI_INVOKE
  config:
    - name: version
      value: gpt-4
  input:
    - name: prompt
      value: |
        Please analyze the following extracted text and provide:
        1. A summary of the main topics
        2. Key findings or important points
        3. Any action items mentioned

        PDF Content:
        {{steps.extract_pdf_text.result.text}}

        DOCX Content:
        {{steps.extract_docx_text.result.text}}
# Pull the table at index 0 out of a remote .docx file.
# NOTE(review): the timeout unit is not stated here - presumably
# milliseconds; confirm against the tool's documentation.
- id: extract_word_table
  name: extract_word_table
  tool: WORD_TABLE_EXTRACT
  config:
    - name: timeout
      value: 300000
  input:
    - name: file_url
      value: "https://example.com/document.docx"
    - name: table_index
      value: 0

# Turn the extracted table into one dict per data row, keyed by the cells of
# the first row, and print the result as JSON.
- id: process_table_data
  name: process_table_data
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: script
      value: |
        import json

        # Get table data from previous step
        table_data = {{steps.extract_word_table.result.table}}

        # The first row supplies the column names; an empty table has none.
        headers = table_data[0] if table_data else []

        # zip() stops at the shorter sequence, so cells beyond the last
        # header are dropped and short rows simply yield fewer keys.
        processed_data = [dict(zip(headers, row)) for row in table_data[1:]]

        print(json.dumps({"processed_table": processed_data}))

# Append rows to the "ExtractedData" sheet.
# NOTE(review): this writes the raw table from extract_word_table (header row
# included), not the output of process_table_data - confirm that is intended.
- id: save_to_spreadsheet
  name: save_to_spreadsheet
  tool: GOOGLE_SPREADSHEET_ADD_ROWS
  config:
    - name: credentials
      value: "{{secrets.GOOGLE_SHEETS_CREDENTIALS}}"
  input:
    - name: spreadsheet_id
      value: "your_spreadsheet_id"
    - name: sheet_name
      value: "ExtractedData"
    - name: values
      value: "{{steps.extract_word_table.result.table}}"
# Ask the user for the document to analyze.
- id: upload_document
  name: upload_document
  tool: INPUT_FILE
  input:
    - name: value
      value: "Document to analyze"

# Classify the upload as 'pdf', 'docx', or 'unknown' from its file name
# extension (case-insensitive) and print the result as JSON.
- id: determine_file_type
  name: determine_file_type
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: script
      value: |
        import json

        # Get file info
        file_info = {{steps.upload_document.result}}
        # `or ''` also covers a filename that is present but null/empty,
        # which .get()'s default alone would not.
        file_name = (file_info.get('filename') or '').lower()

        if file_name.endswith('.pdf'):
            file_type = 'pdf'
        elif file_name.endswith('.docx'):
            file_type = 'docx'
        else:
            file_type = 'unknown'

        print(json.dumps({"file_type": file_type, "filename": file_name}))

# Runs only when the upload was classified as a PDF.
- id: extract_text_pdf
  name: extract_text_pdf
  tool: PDF_EXTRACT_TEXT
  condition: "{{steps.determine_file_type.result.file_type == 'pdf'}}"
  input:
    - name: base64_file
      value: "{{steps.upload_document.result.base64}}"

# Runs only when the upload was classified as a DOCX.
- id: extract_text_docx
  name: extract_text_docx
  tool: DOCX_EXTRACT_TEXT
  condition: "{{steps.determine_file_type.result.file_type == 'docx'}}"
  input:
    - name: base64_file
      value: "{{steps.upload_document.result.base64}}"

# Merge whichever extraction ran, compute word/character counts, and collect
# lines that mention summary/conclusion/action/todo/next steps.
# NOTE(review): the extracted text is interpolated straight into Python
# source. Unless the template engine renders it as a quoted, escaped literal,
# text containing quotes or newlines will break the script - confirm how
# string values are rendered before relying on this.
- id: process_extracted_text
  name: process_extracted_text
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: script
      value: |
        import json

        # Get extracted text based on file type
        pdf_text = {{steps.extract_text_pdf.result.text if steps.extract_text_pdf else ""}}
        docx_text = {{steps.extract_text_docx.result.text if steps.extract_text_docx else ""}}

        # Combine and clean text
        extracted_text = pdf_text or docx_text or ""

        # Basic text analysis
        word_count = len(extracted_text.split())
        char_count = len(extracted_text)

        # Extract potential important sections
        keywords = ['summary', 'conclusion', 'action', 'todo', 'next steps']
        lines = extracted_text.split('\n')
        important_lines = [
            line.strip()
            for line in lines
            if any(keyword in line.lower() for keyword in keywords)
        ]

        result = {
            "text": extracted_text,
            "word_count": word_count,
            "character_count": char_count,
            "important_sections": important_lines[:10]  # Keep only the first 10 matches
        }

        print(json.dumps(result))

# Ask the model for a structured summary of the processed text.
- id: generate_document_summary
  name: generate_document_summary
  tool: OPENAI_INVOKE
  config:
    - name: version
      value: gpt-4
  input:
    - name: prompt
      value: |
        Please create a comprehensive summary of this document:

        Document: {{steps.determine_file_type.result.filename}}
        Word Count: {{steps.process_extracted_text.result.word_count}}

        Content:
        {{steps.process_extracted_text.result.text}}

        Please provide:
        1. Executive Summary (2-3 sentences)
        2. Key Topics and Themes
        3. Important Facts or Data Points
        4. Action Items or Recommendations
        5. Overall Assessment
# Static list of documents to fetch; each entry has a download URL and a
# display name.
- id: get_document_list
  name: get_document_list
  tool: INPUT_JSON
  input:
    - name: value
      value:
        documents:
          - url: "https://example.com/doc1.pdf"
            name: "Report1"
          - url: "https://example.com/doc2.docx"
            name: "Report2"
          - url: "https://example.com/doc3.pdf"
            name: "Report3"

# Download every listed document and emit one entry per document: either
# status "ready" with base64 content and a type, or status "error" with a
# message. Every input document appears in the output exactly once.
- id: process_documents
  name: process_documents
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: script
      value: |
        import json
        import requests
        import base64

        documents = {{steps.get_document_list.result.documents}}
        processed_docs = []

        for doc in documents:
            try:
                # Download document; the timeout keeps one unresponsive
                # host from hanging the whole batch.
                response = requests.get(doc['url'], timeout=30)
                if response.status_code == 200:
                    base64_content = base64.b64encode(response.content).decode('utf-8')
                    # Anything that does not end in .pdf is labelled docx.
                    is_pdf = doc['url'].lower().endswith('.pdf')
                    processed_docs.append({
                        "name": doc['name'],
                        "url": doc['url'],
                        "base64": base64_content,
                        "type": "pdf" if is_pdf else "docx",
                        "status": "ready"
                    })
                else:
                    # Report failed downloads instead of silently dropping
                    # the document from the output.
                    processed_docs.append({
                        "name": doc['name'],
                        "url": doc['url'],
                        "status": "error",
                        "error": "HTTP " + str(response.status_code)
                    })
            except Exception as e:
                processed_docs.append({
                    "name": doc['name'],
                    "url": doc['url'],
                    "status": "error",
                    "error": str(e)
                })

        print(json.dumps({"documents": processed_docs}))