invoice-processing-google-d.../services/invoice_processor_service.py

68 lines
2.7 KiB
Python

# services/invoice_processor_service.py
import logging
from typing import Dict, List, Any, Optional
from .gcp_document_ai_client import process_document_gcp
from .utils import data_cleaner
from core.config import settings
def _extract_specific_fields(
    entities: List[Any],
    # This parameter name must match the one passed in from the router.
    default_confidence_override: Optional[float] = None
) -> Dict[str, str]:
    """
    Filter and normalize the entities extracted from a document.

    Only entities whose ``type_`` is listed in ``settings.REQUIRED_FIELDS``
    are considered. An entity is accepted when its ``confidence`` is at least
    the threshold for its type: the field-specific value from
    ``settings.CONFIDENCE_THRESHOLDS`` when one is configured (this always
    takes priority), otherwise the default threshold for this run.

    When several accepted entities share the same type, the one with the
    highest confidence is kept (the later one wins a tie), so a weaker
    duplicate can no longer overwrite a stronger mention.

    Args:
        entities: Objects exposing ``type_``, ``confidence`` and
            ``mention_text`` attributes.
        default_confidence_override: Default threshold for this run. When
            ``None``, ``settings.CONFIDENCE_THRESHOLDS["__default__"]`` is
            used instead.

    Returns:
        A dict with one key per required field. Fields with no accepted
        entity keep the placeholder ``"Not found or low confidence"``.
        ``invoice_date`` values are passed through
        ``data_cleaner.normalize_date``; when that returns a falsy result the
        value becomes ``"Unparseable Date: '<raw text>'"``.
    """
    extracted_data = {field: "Not found or low confidence" for field in settings.REQUIRED_FIELDS}

    # Resolve the default threshold for this run; the configured default is
    # only looked up when no override was supplied.
    if default_confidence_override is not None:
        default_threshold = default_confidence_override
    else:
        default_threshold = settings.CONFIDENCE_THRESHOLDS["__default__"]

    # Confidence of the value currently stored for each field, used to pick
    # the strongest mention when an entity type appears more than once.
    best_confidence: Dict[str, float] = {}

    for entity in entities:
        entity_type = entity.type_
        if entity_type not in settings.REQUIRED_FIELDS:
            continue

        # A field-specific threshold takes priority over the default one.
        threshold = settings.CONFIDENCE_THRESHOLDS.get(entity_type, default_threshold)
        confidence = entity.confidence
        if not confidence >= threshold:
            continue

        # Skip duplicates that are weaker than the mention already stored.
        previous_best = best_confidence.get(entity_type)
        if previous_best is not None and confidence < previous_best:
            continue

        # Collapse line breaks so multi-line mentions become a single line.
        value = entity.mention_text.replace('\n', ' ').strip()
        if entity_type == 'invoice_date':
            normalized_date = data_cleaner.normalize_date(value)
            value = normalized_date if normalized_date else f"Unparseable Date: '{value}'"

        extracted_data[entity_type] = value
        best_confidence[entity_type] = confidence

    return extracted_data
def process_invoice_from_bytes(
    file_bytes: bytes,
    mime_type: str,
    # This parameter name must match the one used by the router.
    default_confidence_override: Optional[float] = None
) -> Dict[str, str]:
    """
    Orchestrate the complete invoice-processing flow.

    Sends the raw file to ``process_document_gcp`` using the project,
    location and processor configured in ``settings``, then filters the
    returned document's entities with ``_extract_specific_fields``.

    Args:
        file_bytes: Raw content of the file to process.
        mime_type: MIME type forwarded to ``process_document_gcp``.
        default_confidence_override: Optional default confidence threshold
            forwarded to ``_extract_specific_fields``.

    Returns:
        The dict produced by ``_extract_specific_fields``.

    Raises:
        Exception: Any error raised along the way is logged with its
            traceback and then re-raised unchanged.
    """
    try:
        request_kwargs = {
            "project_id": settings.GCP_PROJECT_ID,
            "location": settings.GCP_LOCATION,
            "processor_id": settings.DOCAI_PROCESSOR_ID,
            "file_bytes": file_bytes,
            "mime_type": mime_type,
        }
        document = process_document_gcp(**request_kwargs)

        fields = _extract_specific_fields(document.entities, default_confidence_override)

        # The log shows the override when one was given, otherwise a label
        # saying the configured default applied.
        if default_confidence_override is None:
            threshold_label = "config default"
        else:
            threshold_label = default_confidence_override
        logging.info(f"Documento procesado. Umbral por defecto usado: {threshold_label}.")

        return fields
    except Exception as exc:
        logging.error(f"Error en el flujo de procesamiento de factura: {exc}", exc_info=True)
        raise