| """ |
| Configuration settings for Data Extractor Using Gemini |
| Optimized for Gemini-only model usage with robust directory management |
| """ |
|
|
| import os |
| from pathlib import Path |
| from dotenv import load_dotenv |
| import logging |
|
|
| |
# Load environment variables from a local .env file (if one exists) so the
# Settings class below can read them via os.getenv at class-definition time.
load_dotenv()


# Standard per-module logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)
|
|
|
|
class Settings:
    """Configuration settings with Gemini-only model support and robust directory management.

    All values are read from environment variables (loaded via ``load_dotenv``
    at import time) with defaults applied when a variable is unset. Class
    methods manage the on-disk working/session directory layout and validate
    the configuration as a whole.
    """

    # --- API credentials -------------------------------------------------
    # Required for all Gemini calls; presence is enforced by validate_config().
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    # --- Per-agent Gemini model names ------------------------------------
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")

    # --- Per-agent thinking-token budgets --------------------------------
    # NOTE: the env var names omit "_MODEL" while the attributes include it;
    # this asymmetry is the existing contract and is preserved here.
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
    DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
    CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))

    # --- File handling ---------------------------------------------------
    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
    SUPPORTED_FILE_TYPES = [
        "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
        "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx"
    ]

    # --- Directory layout ------------------------------------------------
    # Root working directory; all other directories are derived from it.
    WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))

    TEMP_DIR = WORKING_DIR / "temp"
    INPUT_DIR = WORKING_DIR / "input"
    OUTPUT_DIR = WORKING_DIR / "output"
    CACHE_DIR = WORKING_DIR / "cache"
    LOGS_DIR = WORKING_DIR / "logs"

    # --- Retry / timeout policy ------------------------------------------
    MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
    RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
    AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))

    # --- Caching ----------------------------------------------------------
    ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
    CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))

    @staticmethod
    def _verify_writable(directory: Path) -> None:
        """Probe *directory* for writability by creating and removing a marker file.

        Raises an OSError subclass if the directory cannot be written to.
        """
        probe = directory / ".write_test"
        probe.write_text("test")
        probe.unlink()

    @classmethod
    def initialize_directories(cls):
        """Create every working directory and verify each one is writable.

        Returns:
            list[str]: string paths of the directories that were initialized.

        Raises:
            RuntimeError: if any directory cannot be created or written to
                (the underlying exception is chained as the cause).
        """
        directories = [
            cls.WORKING_DIR,
            cls.TEMP_DIR,
            cls.INPUT_DIR,
            cls.OUTPUT_DIR,
            cls.CACHE_DIR,
            cls.LOGS_DIR,
        ]

        created_dirs = []
        for directory in directories:
            try:
                directory.mkdir(parents=True, exist_ok=True)
                cls._verify_writable(directory)
                created_dirs.append(str(directory))
                logger.debug(f"Directory initialized: {directory}")
            except Exception as e:
                logger.error(f"Failed to initialize directory {directory}: {e}")
                # Chain the original exception so the root cause is preserved.
                raise RuntimeError(f"Cannot create or write to directory {directory}: {e}") from e

        logger.info(f"Successfully initialized {len(created_dirs)} directories")
        return created_dirs

    @classmethod
    def validate_config(cls):
        """Comprehensive configuration validation with detailed error reporting.

        All problems are collected before reporting: hard failures are raised
        together as one ValueError; soft issues are emitted as log warnings.
        Note: this also eagerly creates the working directory tree.

        Returns:
            bool: True when validation passes.

        Raises:
            ValueError: if any hard configuration error is found.
        """
        errors = []
        warnings = []

        # API key: presence is mandatory; the length check is only a heuristic.
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
        elif len(cls.GOOGLE_API_KEY) < 30:
            warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")

        # Model names: must be non-empty and Gemini-family.
        model_checks = [
            ("DATA_EXTRACTOR_MODEL", cls.DATA_EXTRACTOR_MODEL),
            ("DATA_ARRANGER_MODEL", cls.DATA_ARRANGER_MODEL),
            ("CODE_GENERATOR_MODEL", cls.CODE_GENERATOR_MODEL),
        ]
        for name, model in model_checks:
            if not model:
                errors.append(f"{name} cannot be empty")
            elif not model.startswith("gemini-"):
                errors.append(f"{name} must be a Gemini model (starts with 'gemini-'), got: {model}")

        # Directory layout: any creation/write failure is a hard error.
        try:
            cls.initialize_directories()
        except Exception as e:
            errors.append(f"Directory initialization failed: {e}")

        # File-size limit sanity checks.
        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")

        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Thinking budgets: out-of-range values are soft warnings only.
        budgets = [
            (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
            (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
            (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET")
        ]

        for budget, name in budgets:
            if budget < 1024:
                warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
            elif budget > 8192:
                warnings.append(f"{name} ({budget}) is very high - may be unnecessary")

        # Retry-policy sanity checks (soft warnings only).
        if cls.MAX_RETRIES < 1:
            warnings.append("MAX_RETRIES should be at least 1")
        elif cls.MAX_RETRIES > 10:
            warnings.append("MAX_RETRIES is very high - may cause long delays")

        if errors:
            error_msg = "❌ Configuration validation failed:\n"
            error_msg += "\n".join(f" • {error}" for error in errors)

            if warnings:
                error_msg += "\n\n⚠️ Warnings:\n"
                error_msg += "\n".join(f" • {warning}" for warning in warnings)

            raise ValueError(error_msg)

        if warnings:
            logger.warning("Configuration warnings detected:")
            for warning in warnings:
                logger.warning(f" • {warning}")

        logger.info("✅ Configuration validation successful")
        return True

    @classmethod
    def get_session_directories(cls, session_id: str):
        """Return the per-session directory layout keyed by role (pure; no I/O)."""
        session_base = cls.WORKING_DIR / session_id

        return {
            "base": session_base,
            "input": session_base / "input",
            "output": session_base / "output",
            "temp": session_base / "temp",
            "cache": session_base / "cache"
        }

    @classmethod
    def create_session_directories(cls, session_id: str):
        """Create and validate session-specific directories.

        Returns:
            dict: the session layout from get_session_directories().

        Raises:
            RuntimeError: if any directory cannot be created or written to
                (the underlying exception is chained as the cause).
        """
        session_dirs = cls.get_session_directories(session_id)

        created = []
        for name, directory in session_dirs.items():
            try:
                directory.mkdir(parents=True, exist_ok=True)
                cls._verify_writable(directory)
                created.append(str(directory))

            except Exception as e:
                logger.error(f"Failed to create session directory {name}: {e}")
                raise RuntimeError(f"Cannot create session directory {directory}: {e}") from e

        logger.info(f"Created {len(created)} session directories for {session_id}")
        return session_dirs

    @classmethod
    def cleanup_session(cls, session_id: str, keep_output: bool = True):
        """Remove a session's directories, optionally preserving its output.

        Args:
            session_id: the session whose directories should be removed.
            keep_output: when True (default), the "output" directory is kept.

        Returns:
            list[str]: directories that were removed. Removal failures are
            logged as warnings, not raised (best-effort cleanup).
        """
        session_dirs = cls.get_session_directories(session_id)

        import shutil
        cleaned = []

        for name, directory in session_dirs.items():
            if keep_output and name == "output":
                continue

            if directory.exists():
                try:
                    shutil.rmtree(directory)
                    cleaned.append(str(directory))
                except Exception as e:
                    logger.warning(f"Could not clean {name} directory: {e}")

        logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
        return cleaned

    @classmethod
    def get_debug_info(cls):
        """Return a dict snapshot of environment and configuration for debugging.

        The API key value itself is never included - only presence and length.
        """
        import platform
        import sys

        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "models": {
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL,
            },
            "api_keys": {
                "google_api_key_present": bool(cls.GOOGLE_API_KEY),
                "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0
            }
        }
|
|
|
|
| |
# Shared settings handle for the rest of the application. All configuration
# lives in class attributes/classmethods, so this instance is a convenience.
settings = Settings()


# Eagerly create the working-directory tree at import time. A failure is
# logged but deliberately not re-raised, so importing this module never
# crashes; validate_config() reports the same failure as a hard error later.
try:
    settings.initialize_directories()
    logger.debug("Settings initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize settings: {e}")
| |
|
|
|
|