invoice-processing-google-d.../services/utils/data_cleaner.py

# services/utils/data_cleaner.py
import logging
import locale
import re
from dateutil import parser
from typing import Optional, Dict
from datetime import datetime

# --- ESTA SECCIÓN NO REQUIERE CAMBIOS ---
# Lower-case Spanish month names mapped to the English names that dateutil's
# default parser understands. Calendar order is kept on purpose: the lookup
# in _parse_with_fallback stops at the first entry it finds in the text.
SPANISH_TO_ENGLISH_MONTHS = {
    'enero': 'january',
    'febrero': 'february',
    'marzo': 'march',
    'abril': 'april',
    'mayo': 'may',
    'junio': 'june',
    'julio': 'july',
    'agosto': 'august',
    'septiembre': 'september',
    'octubre': 'october',
    'noviembre': 'november',
    'diciembre': 'december',
}
def _parse_with_fallback(date_string: str) -> Optional[datetime]:
    """Parse a Spanish-worded date by rewriting it into English first.

    The text is lower-cased, the connectors " de " and " del " are collapsed
    to a single space, and the first Spanish month name found (in the order
    of ``SPANISH_TO_ENGLISH_MONTHS``) is replaced by its English name before
    the result is handed to ``dateutil``.

    Args:
        date_string: Raw date text, e.g. "5 de marzo de 2024".

    Returns:
        The parsed ``datetime``, or ``None`` when dateutil rejects the
        rewritten text.
    """
    lowered = date_string.lower()
    for connector in (' de ', ' del '):
        lowered = lowered.replace(connector, ' ')

    # Only one month name is translated: the first dictionary entry that
    # occurs anywhere in the text (every occurrence of it is replaced).
    spanish_name = next(
        (name for name in SPANISH_TO_ENGLISH_MONTHS if name in lowered),
        None,
    )
    if spanish_name is not None:
        lowered = lowered.replace(
            spanish_name, SPANISH_TO_ENGLISH_MONTHS[spanish_name]
        )

    try:
        parsed = parser.parse(lowered)
    except (parser.ParserError, ValueError):
        parsed = None
    return parsed
def normalize_date(date_string: str) -> Optional[str]:
    """Normalize a free-form date string to ``DD/MM/YYYY``.

    The text is first given to ``dateutil`` unchanged; if that fails it is
    retried through ``_parse_with_fallback``, which rewrites Spanish wording
    into English.

    Ambiguous all-numeric dates are read day-first (``05/03/2024`` is
    5 March), matching the ``DD/MM/YYYY`` output convention, unless the text
    starts with a four-digit year (``2024-03-05``), which is read as
    year-month-day.

    Args:
        date_string: Raw date text. Empty or ``None`` yields ``None``.

    Returns:
        The date formatted with ``%d/%m/%Y``, or ``None`` when the input is
        empty or cannot be parsed.
    """
    if not date_string:
        return None

    # No LC_TIME switching here: dateutil matches month names against its own
    # built-in (English) tables and does not consult the process locale, and
    # the '%d/%m/%Y' output is purely numeric. Changing the locale only added
    # a process-wide side effect and made the result depend on which locales
    # happen to be installed on the host.

    # With dayfirst=True dateutil would read "2024-03-05" as year-day-month,
    # so year-leading strings keep the default (year-month-day) ordering.
    starts_with_year = (
        isinstance(date_string, str)
        and re.match(r'\s*\d{4}', date_string) is not None
    )

    try:
        try:
            parsed_date = parser.parse(
                date_string, dayfirst=not starts_with_year
            )
        except (parser.ParserError, ValueError):
            parsed_date = _parse_with_fallback(date_string)
    except OverflowError:
        # dateutil documents OverflowError for values beyond the platform's
        # integer range (e.g. OCR noise made of very long digit runs).
        parsed_date = None

    return parsed_date.strftime('%d/%m/%Y') if parsed_date else None
# --- FIN DE LA SECCIÓN SIN CAMBIOS ---

def clean_numeric_value(text: str) -> str:
    """Clean a numeric string and return it with exactly two decimals.

    Spanish notation (``1.234,56``) is the primary format, but the decimal
    separator is inferred instead of assumed so that dot-decimal input is
    not silently multiplied by 100:

    * both separators present: the right-most one is the decimal mark
      (``1.234,56`` and ``1,234.56`` both give ``1234.56``);
    * a single comma: decimal mark (``12,5`` gives ``12.50``);
    * several commas and no dot: thousands separators;
    * dots only: thousands separators when they form groups of exactly
      three digits (``1.234`` gives ``1234.00``), otherwise a decimal point
      (``12.50`` gives ``12.50``).

    Args:
        text: Raw numeric text. Non-string values are converted with
            ``str()``.

    Returns:
        The value formatted as ``"{:.2f}"``; ``"0.00"`` when ``text`` is
        empty/falsy or is not a plain decimal number (a warning is logged
        in the latter case).
    """
    if not text:
        return "0.00"

    # Trailing separators are sentence punctuation swallowed by callers that
    # capture with a character class such as [\d.,]+ ("12,50," / "1.234.").
    candidate = str(text).strip().rstrip('.,')
    last_dot = candidate.rfind('.')
    last_comma = candidate.rfind(',')

    if last_dot != -1 and last_comma != -1:
        if last_comma > last_dot:
            # European: dots group thousands, comma is the decimal mark.
            normalized = candidate.replace('.', '').replace(',', '.')
        else:
            # Anglo-Saxon: commas group thousands, dot is the decimal mark.
            normalized = candidate.replace(',', '')
    elif last_comma != -1:
        if candidate.count(',') == 1:
            normalized = candidate.replace(',', '.')
        else:
            normalized = candidate.replace(',', '')
    elif re.fullmatch(r'[+-]?\d{1,3}(?:\.\d{3})+', candidate):
        normalized = candidate.replace('.', '')
    else:
        normalized = candidate

    # Accept plain decimals only; this also keeps float() from returning
    # "nan"/"inf" or accepting exponent/underscore notation as an amount.
    if not re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)', normalized):
        logging.warning(f"Could not convert '{text}' to a numeric value. Defaulting to 0.00.")
        return "0.00"

    return f"{float(normalized):.2f}"

def parse_total_and_tax(text: str) -> Dict[str, Optional[str]]:
    """Extract the invoice total and the included VAT from one text line.

    Expected shape (case-insensitive), e.g.
    ``"Total 121,00 € (incluye 21,00€ IVA)"``:

    * total: the first number following the standalone word "Total",
      optionally separated by a colon and/or a euro sign;
    * VAT: inside parentheses, the number that follows "incluye"/"incluido"
      and precedes "IVA" (an optional "€" and "de" may sit in between).

    Args:
        text: Raw line of text; ``None`` or empty is treated as no content.

    Returns:
        A dict with keys ``'total_amount'`` and ``'total_tax_amount'``. Each
        value is the amount as normalized by ``clean_numeric_value``, or
        ``None`` when the corresponding pattern was not found.
    """
    logging.info(f"Analizando línea de importe: '{text}'")
    result: Dict[str, Optional[str]] = {'total_amount': None, 'total_tax_amount': None}

    # Collapse every whitespace run (including newlines) to a single space.
    normalized_text = " ".join((text or "").split())

    # \b keeps "Subtotal" from being taken as the total. The capture must end
    # on a digit so sentence punctuation after the amount is not included.
    total_match = re.search(
        r'\btotal\s*:?\s*(?:€\s*)?([\d.,]*\d)',
        normalized_text,
        re.IGNORECASE,
    )
    if total_match:
        result['total_amount'] = clean_numeric_value(total_match.group(1))

    # VAT amount: "(... incluye 21,00€ IVA ...)"; also accepts a space before
    # the euro sign and the wording "de IVA".
    tax_match = re.search(
        r'\(.*?(?:incluye|incluido)\s+([\d.,]*\d)\s*€?\s*(?:de\s+)?IVA.*?\)',
        normalized_text,
        re.IGNORECASE,
    )
    if tax_match:
        result['total_tax_amount'] = clean_numeric_value(tax_match.group(1))

    if result['total_amount'] and result['total_tax_amount']:
        logging.info(f"¡ÉXITO! Total: {result['total_amount']}, IVA: {result['total_tax_amount']}")
    else:
        logging.warning(f"Fallo en el parsing de importes para la línea: '{normalized_text}'")

    return result