# services/invoice_processor_service.py
"""Invoice processing service built on top of Google Cloud Document AI.

The public entry point is :func:`process_invoice_from_bytes`, which sends a
file to Document AI, dumps the raw response to the log for debugging and
reduces the recognised entities to the fields listed in
``settings.REQUIRED_FIELDS``.
"""
import logging
from typing import Dict, List, Any, Optional
import json  # Needed to pretty-print the Document AI response as JSON

# The Document type and MessageToJson are imported for the debug dump.
from google.cloud.documentai_v1.types import Document
from google.protobuf.json_format import MessageToJson

from .gcp_document_ai_client import process_document_gcp
from .utils import data_cleaner
from core.config import settings

# Placeholder stored for every required field until a confident value is found.
_NOT_FOUND = "Not found or low confidence"


def _find_total_context_line(raw_text: str, text_lines: List[str]) -> Optional[str]:
    """Return the first document line containing *raw_text* and the word "total".

    The match on "total" is case-insensitive and is a plain substring test.
    Returns ``None`` when no line qualifies or when *raw_text* is empty.
    """
    logging.info(f"Buscando contexto para '{raw_text}' con la palabra clave 'Total'")
    if not raw_text:
        # An empty string is a substring of every line, so without this guard
        # the first line mentioning "total" would be picked as context.
        return None
    for line in text_lines:
        if raw_text in line and "total" in line.lower():
            logging.info(f"Contexto definitivo para total_amount encontrado: '{line}'")
            return line
    return None


def _apply_total_amount(
    extracted_data: Dict[str, str],
    raw_text: str,
    text_lines: List[str],
) -> None:
    """Fill total, tax, subtotal and net amounts in *extracted_data* in place.

    The text handed to ``data_cleaner.parse_total_and_tax`` is the contextual
    line found by :func:`_find_total_context_line`, falling back to the bare
    entity text. Nothing is written when the parser yields no total.
    """
    contextual_line = _find_total_context_line(raw_text, text_lines)
    text_to_parse = contextual_line if contextual_line else raw_text

    parsed_amounts = data_cleaner.parse_total_and_tax(text_to_parse)
    total_str = parsed_amounts.get('total_amount')
    tax_str = parsed_amounts.get('total_tax_amount')

    if not total_str:
        return

    extracted_data['total_amount'] = total_str
    if tax_str:
        extracted_data['total_tax_amount'] = tax_str
        try:
            subtotal = float(total_str) - float(tax_str)
        except (ValueError, TypeError):
            # Total and tax are kept; subtotal/net keep their previous values.
            logging.error("Error de conversión para cálculo de subtotal.")
        else:
            subtotal_str = f"{subtotal:.2f}"
            extracted_data['subtotal_amount'] = subtotal_str
            extracted_data['net_amount'] = subtotal_str
    else:
        # No tax parsed: record zero tax and treat the total as the subtotal.
        extracted_data['total_tax_amount'] = '0.00'
        extracted_data['subtotal_amount'] = total_str
        # NOTE(review): only overwrite net_amount if no entity has set it yet.
        if extracted_data.get('net_amount') == _NOT_FOUND:
            extracted_data['net_amount'] = total_str


def _extract_specific_fields(
    document: Document,
    default_confidence_override: Optional[float] = None
) -> Dict[str, str]:
    """Reduce a Document AI response to the fields in ``settings.REQUIRED_FIELDS``.

    Args:
        document: Processed document; its ``text`` and ``entities`` are read.
        default_confidence_override: Threshold used for entity types that have
            no entry in ``settings.CONFIDENCE_THRESHOLDS``. When ``None`` the
            ``"__default__"`` entry of that mapping is used instead.

    Returns:
        A dict with one key per required field. Fields with no entity at or
        above their threshold keep the placeholder
        ``"Not found or low confidence"``.
    """
    extracted_data = {field: _NOT_FOUND for field in settings.REQUIRED_FIELDS}
    if default_confidence_override is not None:
        default_threshold = default_confidence_override
    else:
        default_threshold = settings.CONFIDENCE_THRESHOLDS["__default__"]
    full_text_lines = document.text.split('\n')

    for entity in document.entities:
        entity_type = entity.type_
        # Tax and subtotal are never taken from their own entities; they are
        # derived from the total_amount entity below.
        if (
            entity_type not in settings.REQUIRED_FIELDS
            or entity_type in ['total_tax_amount', 'subtotal_amount']
        ):
            continue

        threshold = settings.CONFIDENCE_THRESHOLDS.get(entity_type, default_threshold)
        if entity.confidence >= threshold:
            raw_text = entity.mention_text.strip()

            if entity_type == 'invoice_date':
                extracted_data[entity_type] = (
                    data_cleaner.normalize_date(raw_text)
                    or f"Unparseable Date: '{raw_text}'"
                )
            elif entity_type == 'total_amount':
                _apply_total_amount(extracted_data, raw_text, full_text_lines)
            elif entity_type in ['net_amount', 'subtotal_amount']:
                # 'subtotal_amount' is filtered out above, so in practice only
                # 'net_amount' reaches this branch. A value already derived
                # from the total is not overwritten.
                if extracted_data.get(entity_type) == _NOT_FOUND:
                    extracted_data[entity_type] = data_cleaner.clean_numeric_value(raw_text)
            else:
                extracted_data[entity_type] = raw_text.replace('\n', ' ').strip()

    return extracted_data


def _log_document_debug_dump(document: Document) -> None:
    """Log the full Document AI response as indented JSON (best effort).

    A serialization failure is logged as a warning and otherwise ignored, so
    that this debugging aid cannot abort invoice processing.
    """
    try:
        # Convert the Document to JSON, then reload it so it can be re-dumped
        # with ensure_ascii=False (keeps non-ASCII characters readable).
        document_json = MessageToJson(document._pb)
        document_dict = json.loads(document_json)
        pretty_json = json.dumps(document_dict, indent=2, ensure_ascii=False)
    except Exception:  # Deliberately broad: debug-only code must not break the flow.
        logging.warning(
            "No se pudo serializar la respuesta de Document AI para depuración.",
            exc_info=True,
        )
        return

    logging.critical("\n\n" + "="*20 + " INICIO RESPUESTA COMPLETA DE DOCUMENT AI " + "="*20)
    logging.critical(pretty_json)
    logging.critical("="*20 + " FIN RESPUESTA COMPLETA DE DOCUMENT AI " + "="*20 + "\n\n")


def process_invoice_from_bytes(
    file_bytes: bytes,
    mime_type: str,
    default_confidence_override: Optional[float] = None
) -> Dict[str, str]:
    """Run the full invoice flow and log the raw GCP response for debugging.

    Args:
        file_bytes: Raw content of the invoice file.
        mime_type: MIME type passed through to ``process_document_gcp``.
        default_confidence_override: Forwarded to
            ``_extract_specific_fields`` as the fallback confidence threshold.

    Returns:
        The dict of extracted fields produced by ``_extract_specific_fields``.

    Raises:
        Exception: Any error from the Document AI call or from field
            extraction is logged with its traceback and re-raised unchanged.
    """
    try:
        document = process_document_gcp(
            project_id=settings.GCP_PROJECT_ID,
            location=settings.GCP_LOCATION,
            processor_id=settings.DOCAI_PROCESSOR_ID,
            file_bytes=file_bytes,
            mime_type=mime_type,
        )

        # --- Debugging aid: dump the complete Document AI response ---
        _log_document_debug_dump(document)

        validated_data = _extract_specific_fields(document, default_confidence_override)
        logging.info(f"Datos finales procesados: {validated_data}")
        return validated_data
    except Exception as e:
        logging.error(f"Error en el flujo de procesamiento de factura: {e}", exc_info=True)
        raise