73 lines
2.7 KiB
Python
73 lines
2.7 KiB
Python
# services/invoice_processor_service.py
|
|
import logging
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
from .gcp_document_ai_client import process_document_gcp
|
|
# Importamos nuestro limpiador de datos para usarlo
|
|
from .utils import data_cleaner
|
|
from core.config import settings
|
|
|
|
# --- Lógica de negocio refactorizada ---
|
|
|
|
def _extract_specific_fields(
    entities: List[Any],
    override_threshold: Optional[float] = None
) -> Dict[str, str]:
    """
    Filter and normalize the entities extracted from a document.

    Every field listed in ``settings.REQUIRED_FIELDS`` starts out with a
    placeholder value and is replaced by the text of a matching entity
    whose confidence reaches the applicable threshold.

    Args:
        entities: Objects exposing ``type_``, ``confidence`` and
            ``mention_text`` attributes.
        override_threshold: When given (including ``0.0``), this value is
            used as the confidence threshold for every field. Otherwise the
            per-field value from ``settings.CONFIDENCE_THRESHOLDS`` is used,
            falling back to its ``"__default__"`` entry.

    Returns:
        A dict keyed by the required field names. Fields with no entity
        at or above the threshold keep the placeholder
        ``"Not found or low confidence"``.

    Raises:
        KeyError: If a required field has no configured threshold and
            ``settings.CONFIDENCE_THRESHOLDS`` has no ``"__default__"`` entry.
    """
    required_fields = settings.REQUIRED_FIELDS
    extracted_data = {
        field: "Not found or low confidence" for field in required_fields
    }

    for entity in entities:
        entity_type = entity.type_

        # Entities we do not report on need no threshold lookup at all.
        if entity_type not in required_fields:
            continue

        # Threshold decision: an explicit override wins over configuration.
        if override_threshold is not None:
            threshold = override_threshold
        else:
            thresholds = settings.CONFIDENCE_THRESHOLDS
            # Only touch "__default__" when the field has no entry of its
            # own, so a missing default cannot fail a configured field.
            if entity_type in thresholds:
                threshold = thresholds[entity_type]
            else:
                threshold = thresholds["__default__"]

        if entity.confidence < threshold:
            continue

        # Collapse line breaks so multi-line mentions become a single line.
        value = entity.mention_text.replace('\n', ' ').strip()

        # Dates are normalized; a falsy result from the cleaner is reported
        # as unparseable together with the raw text.
        if entity_type == 'invoice_date':
            normalized_date = data_cleaner.normalize_date(value)
            value = normalized_date if normalized_date else f"Unparseable Date: '{value}'"

        # When several entities share a type, the last qualifying one wins.
        extracted_data[entity_type] = value

    return extracted_data
|
|
|
|
# --- Función principal del servicio actualizada ---
|
|
|
|
def process_invoice_from_bytes(
    file_bytes: bytes,
    mime_type: str,
    override_threshold: Optional[float] = None  # Optional confidence override
) -> Dict[str, str]:
    """
    Orchestrate the full invoice-processing flow.

    Sends the file to ``process_document_gcp`` using the project, location
    and processor configured in ``settings``, then passes the resulting
    entities and the optional confidence threshold to
    ``_extract_specific_fields``.

    Args:
        file_bytes: Raw content of the invoice file.
        mime_type: MIME type of ``file_bytes``, forwarded unchanged to
            ``process_document_gcp``.
        override_threshold: Optional confidence threshold applied to every
            field instead of the configured per-field thresholds.

    Returns:
        The dict produced by ``_extract_specific_fields``.

    Raises:
        Exception: Any error raised while processing is logged with its
            traceback and re-raised unchanged.
    """
    try:
        document = process_document_gcp(
            project_id=settings.GCP_PROJECT_ID,
            location=settings.GCP_LOCATION,
            processor_id=settings.DOCAI_PROCESSOR_ID,
            file_bytes=file_bytes,
            mime_type=mime_type,
        )

        # Forward the optional threshold to the extraction step.
        validated_data = _extract_specific_fields(document.entities, override_threshold)

        # Compare against None rather than relying on truthiness: an
        # explicit override of 0.0 must be logged as 0.0, not as 'default'.
        threshold_label = override_threshold if override_threshold is not None else 'default'
        logging.info(
            "Documento procesado con éxito con un umbral de %s.",
            threshold_label,
        )
        return validated_data

    except Exception as e:
        # Top-level boundary: record the traceback, then let the caller
        # decide how to handle the failure.
        logging.exception("Error en el flujo de procesamiento de factura: %s", e)
        raise