# Step 1: Fetch the target sheet (values + formatting/metadata) so later steps
# can analyze it. Credentials come from the workflow secret store.
- id: retrieve_data_for_analysis
  name: retrieve_data_for_analysis
  tool: GOOGLE_SHEETS_GET_SHEET
  config:
    - name: credentials
      value: "{{secrets.GOOGLE_SHEETS_CREDENTIALS}}"
  input:
    - name: sheet_id
      value: "{{input.analysis_sheet_id}}"
    - name: sheet_name
      value: "{{input.sheet_name}}"
    # Formatting/metadata are requested so the result carries sheet_name etc.
    - name: include_formatting
      value: true
    - name: include_metadata
      value: true
# Step 2: Run a data-quality analysis over the fetched sheet in a Python
# sandbox. Prints a human-readable report followed by a JSON quality_report
# (consumed downstream via result.stdout).
- id: perform_data_quality_analysis
  name: perform_data_quality_analysis
  tool: PYTHON_SANDBOX_RUN
  input:
    - name: code
      value: |
        import json
        import re

        # Parse the sheet payload injected by the previous step.
        # NOTE(review): assumes result.sheet_data is valid JSON with no
        # embedded triple quotes — confirm the template engine escapes it.
        sheet_data = json.loads('''{{steps.retrieve_data_for_analysis.result.sheet_data}}''')
        rows = sheet_data.get('rows', [])
        if not rows:
            print("No data found for analysis")
        else:
            # First row is treated as the header row; the rest are records.
            headers = rows[0] if rows else []
            data_rows = rows[1:] if len(rows) > 1 else []
            print("Data Quality Analysis Report")
            print("=" * 35)
            print(f"Dataset: {sheet_data.get('sheet_name', 'Unknown')}")
            print(f"Total Records: {len(data_rows)}")
            print(f"Total Columns: {len(headers)}")
            print()
            # Accumulated report emitted as JSON at the end.
            quality_report = {
                "overall_stats": {
                    "total_records": len(data_rows),
                    "total_columns": len(headers),
                    "completeness_score": 0
                },
                "column_analysis": [],
                "data_issues": []
            }
            total_cells = 0
            empty_cells = 0
            # Column-by-column analysis.
            for col_idx, header in enumerate(headers):
                # Ragged rows are padded with "" so every column sees one value per record.
                col_values = [row[col_idx] if col_idx < len(row) else "" for row in data_rows]
                # str() guards against non-string cells (numbers, booleans); the
                # original v.strip() crashed on those.
                non_empty_values = [v for v in col_values if str(v).strip()]
                empty_count = len(col_values) - len(non_empty_values)
                completeness = (len(non_empty_values) / len(col_values)) * 100 if col_values else 0
                # Heuristic data-type detection over non-empty values.
                numeric_count = sum(1 for v in non_empty_values if str(v).replace('.', '').replace('-', '').isdigit())
                date_count = sum(1 for v in non_empty_values if re.match(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', str(v)))
                email_count = sum(1 for v in non_empty_values if '@' in str(v) and '.' in str(v))
                likely_type = "text"
                if non_empty_values:
                    sample_size = len(non_empty_values)
                    # Thresholds: >80% numeric-looking, or >50% date/email-looking.
                    if numeric_count / sample_size > 0.8:
                        likely_type = "numeric"
                    elif date_count / sample_size > 0.5:
                        likely_type = "date"
                    elif email_count / sample_size > 0.5:
                        likely_type = "email"
                col_analysis = {
                    "column_name": header,
                    "completeness_percent": round(completeness, 2),
                    "empty_cells": empty_count,
                    "likely_data_type": likely_type,
                    "unique_values": len(set(non_empty_values)),
                    "sample_values": non_empty_values[:3]
                }
                quality_report["column_analysis"].append(col_analysis)
                total_cells += len(col_values)
                empty_cells += empty_count
                # Flag potential issues for this column.
                if completeness < 90:
                    quality_report["data_issues"].append(f"Column '{header}' has low completeness ({completeness:.1f}%)")
                if len(set(non_empty_values)) == 1 and non_empty_values:
                    quality_report["data_issues"].append(f"Column '{header}' has only one unique value")
            # Overall completeness score across all cells.
            overall_completeness = ((total_cells - empty_cells) / total_cells) * 100 if total_cells > 0 else 0
            quality_report["overall_stats"]["completeness_score"] = round(overall_completeness, 2)
            print(f"Overall Data Completeness: {overall_completeness:.1f}%")
            print()
            # Display column analysis.
            print("Column Analysis:")
            for col in quality_report["column_analysis"]:
                print(f" 📊 {col['column_name']}")
                print(f" Completeness: {col['completeness_percent']}%")
                print(f" Type: {col['likely_data_type']}")
                print(f" Unique Values: {col['unique_values']}")
                print()
            # Display issues.
            if quality_report["data_issues"]:
                print("🚨 Data Quality Issues:")
                for issue in quality_report["data_issues"]:
                    print(f" - {issue}")
            else:
                print("✅ No major data quality issues detected")
            # Machine-readable report for downstream steps.
            print("\n" + json.dumps(quality_report, indent=2))
# Step 3: Ask the LLM for concrete remediation advice, grounded in the
# stdout report produced by perform_data_quality_analysis.
- id: create_quality_recommendations
  name: create_quality_recommendations
  tool: OPENAI_INVOKE
  config:
    - name: version
      value: "gpt-4"
  input:
    - name: prompt
      value: |
        Based on this data quality analysis report, please provide specific recommendations for improving data quality:
        {{steps.perform_data_quality_analysis.result.stdout}}
        Please provide:
        1. Priority issues to address first
        2. Specific steps to improve data completeness
        3. Data validation rules to implement
        4. Best practices for ongoing data maintenance
        5. Suggested data cleaning workflows
# Step 4: Persist the combined analysis + recommendations as a dated
# plain-text report file.
- id: save_quality_analysis_report
  name: save_quality_analysis_report
  tool: OUTPUT_FILE
  input:
    - name: content
      value: |
        Google Sheets Data Quality Analysis Report
        ========================================
        Generated: {{date | format('YYYY-MM-DD HH:mm:ss')}}
        ## Analysis Results
        {{steps.perform_data_quality_analysis.result.stdout}}
        ## Recommendations
        {{steps.create_quality_recommendations.result.content}}
        ## Next Steps
        1. Address high-priority data quality issues
        2. Implement recommended data validation rules
        3. Set up regular data quality monitoring
        4. Train users on data entry best practices
        5. Consider automated data cleaning workflows
    - name: filename
      value: "data_quality_analysis_{{date | format('YYYY-MM-DD')}}.txt"
    - name: fileType
      value: "txt"