From d31765a72c20e64fa3d0d83b700609d8dd7d489c Mon Sep 17 00:00:00 2001
From: Daniel Oscar Zamo <daniel.oscar.zamo@gmail.com>
Date: Wed, 27 Aug 2025 22:21:11 +0200
Subject: [PATCH] status beta 000.002: add net/subtotal/tax fields and keyword-context total parsing

---
 core/config.py                        |  10 +-
 services/invoice_processor_service.py |  91 +++++++++++-----
 services/utils/data_cleaner.py        | 149 ++++++++++++++------------
 3 files changed, 148 insertions(+), 102 deletions(-)

diff --git a/core/config.py b/core/config.py
index 8c0e96d..1090f08 100644
--- a/core/config.py
+++ b/core/config.py
@@ -20,9 +20,11 @@ class Settings(BaseSettings):
         "invoice_id",
         "invoice_date",
         "total_amount",
-        # "net_amount",
+        "net_amount", # Podríamos considerar renombrar esto a subtotal_amount en el futuro
         "receiver_name",
-        "supplier_tax_id"
+        "supplier_tax_id",
+        "total_tax_amount",
+        "subtotal_amount"  # <-- NUEVO CAMPO
     ]
 
     # --- CAMBIO PARA DEPURACIÓN ---
@@ -31,7 +33,9 @@ class Settings(BaseSettings):
         "__default__": 0.82,      
         "supplier_name": 0.80,
         "total_amount": 0.75,
-        # "net_amount": 0.92,
+        "subtotal_amount": 0.75, # Un umbral razonable
+        "net_amount": 0.92,
+        "total_tax_amount": 0.0, # Ponemos 0.0 porque no viene de DocumentAI, lo calculamos nosotros
         "receiver_name": 0.74,
         "supplier_tax_id": 0.46
     }
diff --git a/services/invoice_processor_service.py b/services/invoice_processor_service.py
index 8beb716..617319a 100644
--- a/services/invoice_processor_service.py
+++ b/services/invoice_processor_service.py
@@ -1,53 +1,90 @@
 # services/invoice_processor_service.py
 import logging
 from typing import Dict, List, Any, Optional
+from google.cloud.documentai_v1.types import Document
 
 from .gcp_document_ai_client import process_document_gcp
 from .utils import data_cleaner
 from core.config import settings
 
 def _extract_specific_fields(
-    entities: List[Any],
-    # El nombre del parámetro aquí debe coincidir con el que se le pasa desde el router
+    document: Document,
     default_confidence_override: Optional[float] = None
 ) -> Dict[str, str]:
     """
-    Filtra y normaliza entidades. Si se proporciona `default_confidence_override`,
-    se utiliza para el umbral por defecto. Los umbrales específicos de la
-    configuración siempre tienen prioridad.
+    Extract the required fields from the document, using a keyword-based contextual
+    search over the document's text lines to disambiguate the total amount.
     """
     extracted_data = {field: "Not found or low confidence" for field in settings.REQUIRED_FIELDS}
-
-    # Determina el umbral por defecto para esta ejecución
     default_threshold = default_confidence_override if default_confidence_override is not None else settings.CONFIDENCE_THRESHOLDS["__default__"]
 
-    for entity in entities:
+    full_text_lines = document.text.split('\n')
+
+    for entity in document.entities:
         entity_type = entity.type_
+        
+        if entity_type not in settings.REQUIRED_FIELDS or entity_type in ['total_tax_amount', 'subtotal_amount']:
+            continue
 
-        if entity_type in settings.REQUIRED_FIELDS:
-            # Lógica corregida: Prioriza el umbral específico del campo, si no, usa el por defecto.
-            threshold = settings.CONFIDENCE_THRESHOLDS.get(entity_type, default_threshold)
+        threshold = settings.CONFIDENCE_THRESHOLDS.get(entity_type, default_threshold)
 
-            if entity.confidence >= threshold:
-                value = entity.mention_text.replace('\n', ' ').strip()
-                
-                if entity_type == 'invoice_date':
-                    normalized_date = data_cleaner.normalize_date(value)
-                    value = normalized_date if normalized_date else f"Unparseable Date: '{value}'"
-                
-                extracted_data[entity_type] = value
+        if entity.confidence >= threshold:
+            raw_text = entity.mention_text.strip()
+
+            if entity_type == 'invoice_date':
+                extracted_data[entity_type] = data_cleaner.normalize_date(raw_text) or f"Unparseable Date: '{raw_text}'"
             
+            elif entity_type == 'total_amount':
+                # --- Keyword-based context search: find the text line that holds this amount ---
+                contextual_line = None
+                logging.info(f"Buscando contexto para '{raw_text}' con la palabra clave 'Total'")
+                for line in full_text_lines:
+                    # The line must contain both the amount text and the word "total" (case-insensitive)
+                    if raw_text in line and "total" in line.lower():
+                        contextual_line = line
+                        logging.info(f"Contexto definitivo para total_amount encontrado: '{contextual_line}'")
+                        break
+                
+                # If no contextual line was found, fall back to the entity's own text
+                text_to_parse = contextual_line if contextual_line else raw_text
+                
+                parsed_amounts = data_cleaner.parse_total_and_tax(text_to_parse)
+                
+                total_str = parsed_amounts.get('total_amount')
+                tax_str = parsed_amounts.get('total_tax_amount')
+
+                if total_str:
+                    extracted_data['total_amount'] = total_str
+                    if tax_str:
+                        extracted_data['total_tax_amount'] = tax_str
+                        try:
+                            subtotal = float(total_str) - float(tax_str)
+                            subtotal_str = f"{subtotal:.2f}"
+                            extracted_data['subtotal_amount'] = subtotal_str
+                            extracted_data['net_amount'] = subtotal_str
+                        except (ValueError, TypeError):
+                            logging.error("Error de conversión para cálculo de subtotal.")
+                    else:
+                        extracted_data['total_tax_amount'] = '0.00'
+                        extracted_data['subtotal_amount'] = total_str
+                        if extracted_data.get('net_amount') == "Not found or low confidence":
+                             extracted_data['net_amount'] = total_str
+            
+            elif entity_type in ['net_amount', 'subtotal_amount']:
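+                 # Only take the entity value while the field is still unset (it may already be derived from total_amount). NOTE(review): 'subtotal_amount' is skipped by the guard at the top of the loop, so only 'net_amount' reaches here.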
+                 if extracted_data.get(entity_type) == "Not found or low confidence":
+                    extracted_data[entity_type] = data_cleaner.clean_numeric_value(raw_text)
+            else:
+                extracted_data[entity_type] = raw_text.replace('\n', ' ').strip()
+                
     return extracted_data
 
 def process_invoice_from_bytes(
     file_bytes: bytes, 
     mime_type: str,
-    # El nombre del parámetro aquí debe coincidir con el del router
     default_confidence_override: Optional[float] = None
 ) -> Dict[str, str]:
-    """
-    Orquesta el proceso completo.
-    """
+    """ Orquesta el proceso completo. """
     try:
         document = process_document_gcp(
             project_id=settings.GCP_PROJECT_ID,
@@ -56,13 +93,9 @@ def process_invoice_from_bytes(
             file_bytes=file_bytes,
             mime_type=mime_type,
         )
-
-        validated_data = _extract_specific_fields(document.entities, default_confidence_override)
-        
-        log_threshold = default_confidence_override if default_confidence_override is not None else "config default"
-        logging.info(f"Documento procesado. Umbral por defecto usado: {log_threshold}.")
+        validated_data = _extract_specific_fields(document, default_confidence_override)
+        logging.info(f"Datos finales procesados: {validated_data}")
         return validated_data
-
     except Exception as e:
         logging.error(f"Error en el flujo de procesamiento de factura: {e}", exc_info=True)
         raise
\ No newline at end of file
diff --git a/services/utils/data_cleaner.py b/services/utils/data_cleaner.py
index 45294df..e18a669 100644
--- a/services/utils/data_cleaner.py
+++ b/services/utils/data_cleaner.py
@@ -1,87 +1,96 @@
 # services/utils/data_cleaner.py
 import logging
 import locale
+import re
 from dateutil import parser
-from typing import Optional
+from typing import Optional, Dict
 from datetime import datetime
 
-SPANISH_TO_ENGLISH_MONTHS = {
-    'enero': 'january',
-    'febrero': 'february',
-    'marzo': 'march',
-    'abril': 'april',
-    'mayo': 'may',
-    'junio': 'june',
-    'julio': 'july',
-    'agosto': 'august',
-    'septiembre': 'september',
-    'octubre': 'october',
-    'noviembre': 'november',
-    'diciembre': 'december'
-}
-
+# --- Date parsing helpers (Spanish month names translated for dateutil) ---
+SPANISH_TO_ENGLISH_MONTHS = { 'enero': 'january', 'febrero': 'february', 'marzo': 'march', 'abril': 'april', 'mayo': 'may', 'junio': 'june', 'julio': 'july', 'agosto': 'august', 'septiembre': 'september', 'octubre': 'october', 'noviembre': 'november', 'diciembre': 'december'}
 def _parse_with_fallback(date_string: str) -> Optional[datetime]:
-    """
-    Intenta parsear la fecha usando un fallback manual que primero limpia
-    preposiciones comunes en español ("de", "del") y luego traduce los meses.
-    """
-    # 1. Normalizar a minúsculas para trabajar de forma consistente
-    temp_string = date_string.lower()
-
-    # 2. Traducir el mes de español a inglés
+    temp_string = date_string.lower().replace(' de ', ' ').replace(' del ', ' ')
     for spa, eng in SPANISH_TO_ENGLISH_MONTHS.items():
-        if spa in temp_string:
-            temp_string = temp_string.replace(spa, eng)
-            break # Salimos del bucle una vez que encontramos y reemplazamos el mes
-
-    # 3. Eliminar preposiciones comunes, cuidando los espacios para evitar unir palabras
-    temp_string = temp_string.replace(' de ', ' ')
-    temp_string = temp_string.replace(' del ', ' ')
-    
-    # Después de la limpieza, la cadena debería ser algo como '5 january 2030', que es parseable.
-    try:
-        logging.info(f"Attempting to parse cleaned date string: '{temp_string}'")
-        return parser.parse(temp_string)
-    except (parser.ParserError, ValueError):
-        # Si incluso después de la limpieza falla, no podemos hacer más.
-        logging.warning(f"Fallback parsing failed even for cleaned string: '{temp_string}'")
-        return None
-
+        if spa in temp_string: temp_string = temp_string.replace(spa, eng); break
+    try: return parser.parse(temp_string)
+    except (parser.ParserError, ValueError): return None
 def normalize_date(date_string: str) -> Optional[str]:
-    """
-    Parses a date string from various formats and normalizes it to DD/MM/YYYY.
-    It first tries using Spanish locale, and if it fails, it uses a manual
-    cleaning and translation fallback.
-    """
-    if not date_string:
-        return None
-
+    if not date_string: return None
     original_locale = locale.getlocale(locale.LC_TIME)
     parsed_date = None
-    
-    # Estrategia 1: Intentar con el locale español
     try:
-        try:
-            locale.setlocale(locale.LC_TIME, 'es_ES.UTF-8')
-        except locale.Error:
-            locale.setlocale(locale.LC_TIME, 'Spanish')
-
+        try: locale.setlocale(locale.LC_TIME, 'es_ES.UTF-8')
+        except locale.Error: locale.setlocale(locale.LC_TIME, 'Spanish')
         parsed_date = parser.parse(date_string)
-
     except (parser.ParserError, ValueError, locale.Error):
-        logging.warning(f"Could not parse date '{date_string}' using Spanish locale. Attempting robust fallback.")
-        # Estrategia 2: Si el locale falla, usar el fallback robusto
         parsed_date = _parse_with_fallback(date_string)
-    
-    finally:
-        # Siempre restauramos el locale original
-        locale.setlocale(locale.LC_TIME, original_locale)
+    finally: locale.setlocale(locale.LC_TIME, original_locale)
+    return parsed_date.strftime('%d/%m/%Y') if parsed_date else None
+# --- End of date parsing helpers ---
 
-    if parsed_date:
-        # Aquí se asegura el formato DD/MM/AAAA.
-        # '%d' -> día con cero (05), '%m' -> mes con cero (01), '%Y' -> año (2030)
-        return parsed_date.strftime('%d/%m/%Y')
+
+def clean_numeric_value(text: str) -> str:
+    """Normalize a number written with '.' as thousands separator and ',' as decimal mark to an 'N.NN' string; returns '0.00' for empty or unparseable input."""
+    if not text: return "0.00"
+    cleaned = text.strip().replace('.', '').replace(',', '.')
+    try: return f"{float(cleaned):.2f}"
+    except (ValueError, TypeError):
+        logging.warning(f"Could not convert '{text}' to a numeric value. Defaulting to 0.00.")
+        return "0.00"
+
+def parse_total_and_tax(text: str) -> Dict[str, Optional[str]]:
+    """
+    Parse the total (first number in the text) and, if present, the VAT amount quoted in parentheses; values that are not found stay None.
+    """
+    logging.info(f"Texto original recibido para parsing: '{text}'")
+    normalized_text = " ".join(text.split())
+    logging.info(f"Texto normalizado para la regex: '{normalized_text}'")
+
+    result = {'total_amount': None, 'total_tax_amount': None}
+
+    total_match = re.search(r'([\d.,]+)', normalized_text)
+    if total_match:
+        result['total_amount'] = clean_numeric_value(total_match.group(1))
+
+    # Lenient regex: inside parentheses, capture the number that follows 'incluye', 'incluido' or 'iva'
+    tax_match = re.search(r'\(.*?(?:incluye|incluido|iva)\s+([\d.,]+).*?\)', normalized_text, re.IGNORECASE)
+    
+    if tax_match:
+        result['total_tax_amount'] = clean_numeric_value(tax_match.group(1))
+        logging.info(f"¡ÉXITO! Importe de IVA encontrado y limpiado: {result['total_tax_amount']}")
     else:
-        # Si ambas estrategias fallan, registramos el error final
-        logging.error(f"Failed to parse date '{date_string}' with all available methods.")
-        return None
\ No newline at end of file
+        logging.warning(f"No se encontró desglose de IVA en el texto normalizado: '{normalized_text}'")
+
+    return result
+
+def parse_total_and_tax_LEGACY(text: str) -> Dict[str, Optional[str]]:
+    """
+    Legacy variant of parse_total_and_tax: the VAT regex additionally requires the literal "IVA" after the amount.
+    Whitespace is normalized first, then the stricter regex is applied to the normalized text.
+    """
+    logging.info(f"Texto original recibido para parsing: '{text}'")
+    
+    # --- PASO 1: PRE-PROCESAMIENTO Y NORMALIZACIÓN DEL TEXTO ---
+    # Esto convierte saltos de línea, tabs y espacios múltiples en un único espacio.
+    # Ej: "398,00€\n  (incluye..." -> "398,00€ (incluye..."
+    normalized_text = " ".join(text.split())
+    logging.info(f"Texto normalizado para la regex: '{normalized_text}'")
+
+    result = {'total_amount': None, 'total_tax_amount': None}
+
+    # 2. Extraer el importe total (el primer número que encuentre del texto normalizado)
+    total_match = re.search(r'([\d.,]+)', normalized_text)
+    if total_match:
+        result['total_amount'] = clean_numeric_value(total_match.group(1))
+        logging.info(f"Importe total encontrado y limpiado: {result['total_amount']}")
+
+    # 3. Regex de alta precisión aplicada sobre el texto normalizado.
+    tax_match = re.search(r'\(.*?(?:incluye|incluido)\s+([\d.,]+)€?\s*IVA.*?\)', normalized_text, re.IGNORECASE)
+    
+    if tax_match:
+        result['total_tax_amount'] = clean_numeric_value(tax_match.group(1))
+        logging.info(f"¡ÉXITO! Importe de IVA encontrado y limpiado: {result['total_tax_amount']}")
+    else:
+        logging.warning(f"No se encontró desglose de IVA en el texto normalizado: '{normalized_text}'")
+
+    return result
\ No newline at end of file