# services/invoice_processor_service.py
"""Invoice field extraction on top of Google Cloud Document AI.

The public entry point is ``process_invoice_from_bytes``: it sends the raw
file to the configured Document AI processor and reduces the returned
``Document`` to a flat ``{field_name: value}`` mapping.
"""
import json
import logging
from typing import Dict, List, Any, Optional

from google.cloud.documentai_v1.types import Document
from google.protobuf.json_format import MessageToJson

from .gcp_document_ai_client import process_document_gcp
from .utils import data_cleaner
from core.config import settings

logger = logging.getLogger(__name__)

# Placeholder stored for every required field until a confident value is found.
_NOT_FOUND = "Not found or low confidence"

# Fields that are never read from their own entity; they are derived while
# handling the 'total_amount' entity.
_DERIVED_FIELDS = frozenset({"total_tax_amount", "subtotal_amount"})


def _find_total_context_line(raw_text: str, full_text_lines: List[str]) -> Optional[str]:
    """Return the first text line containing both *raw_text* and the word "total".

    Returns ``None`` when no line matches. An empty *raw_text* never matches:
    ``"" in line`` is true for every line, which would otherwise select the
    first line that merely mentions "total".
    """
    if not raw_text:
        return None
    for line in full_text_lines:
        if raw_text in line and "total" in line.lower():
            return line
    return None


def _apply_total_amount(
    extracted_data: Dict[str, str],
    raw_text: str,
    full_text_lines: List[str],
) -> None:
    """Fill total, tax, subtotal and net amount in *extracted_data* (in place)."""
    logger.info("Buscando contexto para '%s' con la palabra clave 'Total'", raw_text)
    contextual_line = _find_total_context_line(raw_text, full_text_lines)
    if contextual_line:
        logger.info("Contexto definitivo para total_amount encontrado: '%s'", contextual_line)

    # Fall back to the entity text itself when no contextual line was found.
    text_to_parse = contextual_line or raw_text
    parsed_amounts = data_cleaner.parse_total_and_tax(text_to_parse)
    total_str = parsed_amounts.get('total_amount')
    tax_str = parsed_amounts.get('total_tax_amount')

    if not total_str:
        return

    extracted_data['total_amount'] = total_str
    if tax_str:
        extracted_data['total_tax_amount'] = tax_str
        try:
            subtotal = float(total_str) - float(tax_str)
        except (ValueError, TypeError):
            # Total and tax are kept; subtotal/net simply stay unresolved.
            logger.error("Error de conversión para cálculo de subtotal.")
        else:
            subtotal_str = f"{subtotal:.2f}"
            extracted_data['subtotal_amount'] = subtotal_str
            extracted_data['net_amount'] = subtotal_str
    else:
        # No tax parsed: report zero tax and treat the total as the subtotal.
        extracted_data['total_tax_amount'] = '0.00'
        extracted_data['subtotal_amount'] = total_str
        # Do not overwrite a net amount already taken from its own entity.
        if extracted_data.get('net_amount') == _NOT_FOUND:
            extracted_data['net_amount'] = total_str


def _extract_specific_fields(
    document: Document,
    default_confidence_override: Optional[float] = None
) -> Dict[str, str]:
    """Extract the required invoice fields from a Document AI ``Document``.

    Args:
        document: Processed document; its ``text`` and ``entities`` are read.
        default_confidence_override: When not ``None``, replaces the
            ``"__default__"`` confidence threshold for entity types that have
            no specific entry in ``settings.CONFIDENCE_THRESHOLDS``.

    Returns:
        One entry per name in ``settings.REQUIRED_FIELDS``. Fields with no
        entity at or above their threshold keep the placeholder
        ``"Not found or low confidence"``. Handling ``total_amount`` may also
        set ``total_tax_amount``, ``subtotal_amount`` and ``net_amount``.
    """
    extracted_data = {field: _NOT_FOUND for field in settings.REQUIRED_FIELDS}
    if default_confidence_override is not None:
        default_threshold = default_confidence_override
    else:
        default_threshold = settings.CONFIDENCE_THRESHOLDS["__default__"]

    full_text_lines = document.text.split('\n')

    for entity in document.entities:
        entity_type = entity.type_
        if entity_type not in settings.REQUIRED_FIELDS or entity_type in _DERIVED_FIELDS:
            continue

        threshold = settings.CONFIDENCE_THRESHOLDS.get(entity_type, default_threshold)
        if entity.confidence < threshold:
            continue

        raw_text = entity.mention_text.strip()

        if entity_type == 'invoice_date':
            extracted_data[entity_type] = (
                data_cleaner.normalize_date(raw_text) or f"Unparseable Date: '{raw_text}'"
            )
        elif entity_type == 'total_amount':
            _apply_total_amount(extracted_data, raw_text, full_text_lines)
        elif entity_type == 'net_amount':
            # 'subtotal_amount' can never reach this point (it is skipped
            # above), so only 'net_amount' needs handling here. Keep a value
            # already derived from the total.
            if extracted_data.get(entity_type) == _NOT_FOUND:
                extracted_data[entity_type] = data_cleaner.clean_numeric_value(raw_text)
        else:
            extracted_data[entity_type] = raw_text.replace('\n', ' ').strip()

    return extracted_data


def _log_raw_document(document: Document) -> None:
    """Dump the complete Document AI response to the log at DEBUG level.

    The dump contains the full invoice content and is costly to build, so it
    is produced only when DEBUG logging is enabled for this module.
    """
    if not logger.isEnabledFor(logging.DEBUG):
        return
    try:
        # Round-trip through json so the output is indented and keeps
        # non-ASCII characters readable.
        document_dict = json.loads(MessageToJson(document._pb))
        pretty = json.dumps(document_dict, indent=2, ensure_ascii=False)
    except Exception:
        # Diagnostics are best effort: a serialization problem must never
        # abort the actual invoice processing.
        logger.debug("No se pudo serializar la respuesta de Document AI.", exc_info=True)
        return

    banner = "=" * 20
    logger.debug("\n\n%s INICIO RESPUESTA COMPLETA DE DOCUMENT AI %s", banner, banner)
    logger.debug("%s", pretty)
    logger.debug("%s FIN RESPUESTA COMPLETA DE DOCUMENT AI %s\n\n", banner, banner)


def process_invoice_from_bytes(
    file_bytes: bytes,
    mime_type: str,
    default_confidence_override: Optional[float] = None
) -> Dict[str, str]:
    """Run the full pipeline: Document AI call followed by field extraction.

    Args:
        file_bytes: Raw content of the invoice file.
        mime_type: MIME type passed through to the Document AI client.
        default_confidence_override: Optional replacement for the default
            confidence threshold (see ``_extract_specific_fields``).

    Returns:
        The extracted field mapping.

    Raises:
        Exception: Any error from the Document AI call or the extraction is
            logged with its traceback and re-raised unchanged.
    """
    try:
        document = process_document_gcp(
            project_id=settings.GCP_PROJECT_ID,
            location=settings.GCP_LOCATION,
            processor_id=settings.DOCAI_PROCESSOR_ID,
            file_bytes=file_bytes,
            mime_type=mime_type,
        )
        _log_raw_document(document)
        validated_data = _extract_specific_fields(document, default_confidence_override)
        logger.info("Datos finales procesados: %s", validated_data)
        return validated_data
    except Exception as e:
        logger.error("Error en el flujo de procesamiento de factura: %s", e, exc_info=True)
        raise