commit ca8848a9c86cb777a8e63bcc8f9201f0c545c2c9
Author: Daniel Oscar Zamo
Date:   Tue Aug 26 12:26:03 2025 +0200

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f0e4b5a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,81 @@
+# ===================================================================
+# Mis archivos extras (en lo posible, estructura de marco de trabajo ACE)
+# ===================================================================
++/
+# Por ahora, como mkdocs no está, no hago el push
+docs/
+data/
+!data/.gitkeep
+*beta*
+#*.py
+*y.v*
+
+# Archivos de construcción de PyInstaller
+/dist
+/build
+*.spec
+
+# ===================================================================
+# 1. SECRETOS Y CONFIGURACIÓN LOCAL
+# ¡Nunca subir claves de API, contraseñas u otros secretos!
+# ===================================================================
+.env
+.env.*
+#>!.env.example
+
+
+# ===================================================================
+# 2. ENTORNOS VIRTUALES
+# Se pueden recrear a partir de requirements.txt, no deben estar en el repo.
+# ===================================================================
+.venv/
+venv/
+env/
+
+
+# ===================================================================
+# 3. FICHEROS COMPILADOS Y CACHE DE PYTHON
+# Generados automáticamente por el intérprete de Python.
+# ===================================================================
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+
+# ===================================================================
+# 4. FICHEROS DE IDEs Y EDITORES DE CÓDIGO
+# Configuración específica del entorno de desarrollo de cada persona.
+# ===================================================================
+.vscode/
+.idea/
+*.swp
+*.swo
+
+
+# ===================================================================
+# 5. FICHEROS DEL SISTEMA OPERATIVO
+# Metadatos y archivos basura de macOS, Windows y Linux.
+# =================================================================== +.DS_Store +Thumbs.db +desktop.ini + + +# =================================================================== +# 6. PAQUETES Y DISTRIBUCIÓN +# Directorios generados al crear un paquete instalable (pip). +# =================================================================== +build/ +dist/ +*.egg-info/ + + +# =================================================================== +# 7. LOGS Y REPORTES +# Archivos de registro que se generan durante la ejecución. +# =================================================================== +*.log +logs/ +htmlcov/ +.pytest_cache/ +.coverage diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..2de83f0 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,17 @@ +{ + "MD010": false, + "MD041": false, + "MD047": false, + "MD007": false, + "MD012": false, + "MD013": false, + "MD025": false, + "MD028": false, + "MD029": false, + "MD031": false, + "MD030": false, + "MD033": false, + "MD034": false, + "MD036": false, + "MD040": false +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..b7c654d --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +- [ ] Repositorio en mi gitea +- [ ] Integrar los dos repos en uno solo y alojarlos en este repositorio +- [ ] Codificar un front end, primero basado en para Python +- [ ] Reimplementar para API +- [ ] Recodificar para con PHP (basado en algun framework diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/dependencies.py b/api/dependencies.py new file mode 100644 index 0000000..2663502 --- /dev/null +++ b/api/dependencies.py @@ -0,0 +1,58 @@ +# api/dependencies.py +from fastapi import Depends, HTTPException, status +from fastapi.security import OAuth2PasswordBearer +from sqlalchemy.orm import Session + +from core import security +from db import crud, models +from db.database import get_db + +# ============================ ¡EL 
CAMBIO CLAVE ESTÁ AQUÍ! ============================ + +# GUARDIA ESTRICTO: Para endpoints protegidos. Si no hay token, lanza un error 401. +oauth2_scheme_strict = OAuth2PasswordBearer(tokenUrl="/api/users/token") + +# GUARDIA PERMISIVO: Para endpoints opcionales. Si no hay token, NO lanza error. +oauth2_scheme_optional = OAuth2PasswordBearer(tokenUrl="/api/users/token", auto_error=False) + +# =================================================================================== + +def get_current_user( + db: Session = Depends(get_db), token: str = Depends(oauth2_scheme_strict) # <-- Usa el guardia estricto +) -> models.User: + credentials_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + + username = security.decode_access_token(token) + if username is None: + raise credentials_exception + + user = crud.get_user_by_username(db, username=username) + if user is None: + raise credentials_exception + + return user + +def get_current_active_user( + current_user: models.User = Depends(get_current_user) +) -> models.User: + if not current_user.is_active: + raise HTTPException(status_code=400, detail="Inactive user") + return current_user + +def get_current_user_optional( + db: Session = Depends(get_db), token: str | None = Depends(oauth2_scheme_optional) # <-- Usa el guardia permisivo +) -> models.User | None: + if not token: + return None + try: + username = security.decode_access_token(token) + if username is None: + return None + user = crud.get_user_by_username(db, username=username) + return user + except Exception: + return None diff --git a/api/routers/__init__.py b/api/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/routers/invoices.py b/api/routers/invoices.py new file mode 100644 index 0000000..cdce728 --- /dev/null +++ b/api/routers/invoices.py @@ -0,0 +1,35 @@ +# api/routers/invoices.py +from fastapi import APIRouter, 
Depends, UploadFile, File, HTTPException
+from typing import Dict
+
+from api.dependencies import get_current_active_user
+from services import invoice_processor_service
+from core.config import settings
+from db.models import User
+
+router = APIRouter()
+
+@router.post("/upload", response_model=Dict[str, str])
+async def upload_invoice(
+    file: UploadFile = File(...),
+    current_user: User = Depends(get_current_active_user)
+):
+    """
+    Upload an invoice, run it through the invoice processor and return the extracted fields.
+
+    Requires an authenticated, active user (enforced by ``get_current_active_user``).
+
+    Raises:
+        HTTPException: 400 when the uploaded content type is not PDF, JPEG or PNG;
+            500 when reading or processing the file fails.
+    """
+    if file.content_type not in ["application/pdf", "image/jpeg", "image/png"]:
+        raise HTTPException(status_code=400, detail="Tipo de archivo no soportado.")
+
+    try:
+        file_bytes = await file.read()
+        # The service resolves the GCP project, location and processor from
+        # settings itself, so only the payload and its MIME type are passed.
+        extracted_data = invoice_processor_service.process_invoice_from_bytes(
+            file_bytes=file_bytes,
+            mime_type=file.content_type
+        )
+        return extracted_data
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error al procesar la factura: {e}")
diff --git a/api/routers/users.py b/api/routers/users.py
new file mode 100644
index 0000000..5555d18
--- /dev/null
+++ b/api/routers/users.py
@@ -0,0 +1,62 @@
+# api/routers/users.py
+from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi.security import OAuth2PasswordRequestForm
+from sqlalchemy.orm import Session
+from datetime import timedelta
+
+from api import schemas
+from db import crud
+from db.database import get_db
+from core import security
+from core.config import settings
+
+router = APIRouter()
+
+# ================== ¡ENDPOINT DE REGISTRO! ==================
+# Este es el endpoint que faltaba y que arregla el error 404
+@router.post("/register", response_model=schemas.User)
+def register_user(user: schemas.UserCreate, db: Session = Depends(get_db)):
+    """
+    Crea un nuevo usuario en la base de datos.
+ """ + db_user = crud.get_user_by_username(db, username=user.username) + if db_user: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Username already registered" + ) + + db_user_email = crud.get_user_by_email(db, email=user.email) + if db_user_email: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Email already registered" + ) + + return crud.create_user(db=db, user=user) +# ============================================================= + + +# ================== ENDPOINT DE LOGIN ================== +@router.post("/token", response_model=schemas.Token) +def login_for_access_token( + db: Session = Depends(get_db), + form_data: OAuth2PasswordRequestForm = Depends() +): + """ + Procesa el formulario de login y devuelve un token de acceso JWT. + """ + user = crud.authenticate_user(db, username=form_data.username, password=form_data.password) + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Incorrect username or password", + headers={"WWW-Authenticate": "Bearer"}, + ) + + access_token_expires = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES) + access_token = security.create_access_token( + data={"sub": user.username}, expires_delta=access_token_expires + ) + + return {"access_token": access_token, "token_type": "bearer"} diff --git a/api/schemas.py b/api/schemas.py new file mode 100644 index 0000000..3da610a --- /dev/null +++ b/api/schemas.py @@ -0,0 +1,31 @@ +# api/schemas.py +from pydantic import BaseModel, EmailStr +from typing import Optional + +# --- Token Schemas --- +class Token(BaseModel): + access_token: str + token_type: str + +class TokenData(BaseModel): + username: Optional[str] = None + +# --- User Schemas --- + +# Propiedades base del usuario (compartidas) +class UserBase(BaseModel): + username: str + email: EmailStr + +# Esquema para la creación de un usuario (recibe la contraseña) +class UserCreate(UserBase): + password: str + +# Esquema para 
leer/devolver un usuario desde la API (nunca incluye la contraseña) +class User(UserBase): + id: int + is_active: bool + + class Config: + # Permite que Pydantic lea los datos directamente desde un objeto ORM de SQLAlchemy + from_attributes = True diff --git a/app.db b/app.db new file mode 100644 index 0000000..58756af Binary files /dev/null and b/app.db differ diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/config.py b/core/config.py new file mode 100644 index 0000000..364bad9 --- /dev/null +++ b/core/config.py @@ -0,0 +1,34 @@ +# core/config.py +from pydantic_settings import BaseSettings, SettingsConfigDict +from typing import List, Dict + +class Settings(BaseSettings): + # Carga las variables desde un fichero .env + model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") + + # --- Configuración de Seguridad --- + SECRET_KEY: str + ACCESS_TOKEN_EXPIRE_MINUTES: int = 30 + + # --- Configuración de Google Cloud Document AI --- + GCP_PROJECT_ID: str + GCP_LOCATION: str + DOCAI_PROCESSOR_ID: str + + # --- Lógica de Negocio (extraída del antiguo config.py) --- + REQUIRED_FIELDS: List[str] = [ + "supplier_name", + "invoice_id", + "invoice_date", + "total_amount" + ] + + # Umbrales de confianza por campo. Un valor por defecto y anulaciones específicas. 
+ CONFIDENCE_THRESHOLDS: Dict[str, float] = { + "__default__": 0.85, + "supplier_name": 0.90, + "total_amount": 0.95 + } + +# Creamos una única instancia global de la configuración +settings = Settings() diff --git a/core/security.py b/core/security.py new file mode 100644 index 0000000..a394928 --- /dev/null +++ b/core/security.py @@ -0,0 +1,50 @@ +# core/security.py +from datetime import datetime, timedelta, timezone +from typing import Optional + +from jose import JWTError, jwt +from passlib.context import CryptContext + +from .config import settings + +# Configuración para el hashing de contraseñas +# Usamos bcrypt, que es el estándar recomendado. +pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") + +# ALGORITHM y SECRET_KEY deben coincidir con los de tu configuración +ALGORITHM = "HS256" + +def verify_password(plain_password: str, hashed_password: str) -> bool: + """Verifica si una contraseña en texto plano coincide con su hash.""" + return pwd_context.verify(plain_password, hashed_password) + +def get_password_hash(password: str) -> str: + """Genera el hash de una contraseña.""" + return pwd_context.hash(password) + +def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str: + """Crea un nuevo token de acceso JWT.""" + to_encode = data.copy() + if expires_delta: + expire = datetime.now(timezone.utc) + expires_delta + else: + # Por defecto, el token expira en 15 minutos + expire = datetime.now(timezone.utc) + timedelta(minutes=15) + + to_encode.update({"exp": expire}) + encoded_jwt = jwt.encode(to_encode, settings.SECRET_KEY, algorithm=ALGORITHM) + return encoded_jwt + +def decode_access_token(token: str) -> Optional[str]: + """ + Decodifica un token de acceso y devuelve el nombre de usuario (subject). + Retorna None si el token es inválido o ha expirado. 
+ """ + try: + payload = jwt.decode(token, settings.SECRET_KEY, algorithms=[ALGORITHM]) + username: Optional[str] = payload.get("sub") + if username is None: + return None + return username + except JWTError: + return None diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/db/crud.py b/db/crud.py new file mode 100644 index 0000000..c2e4bc2 --- /dev/null +++ b/db/crud.py @@ -0,0 +1,57 @@ +# db/crud.py +from sqlalchemy.orm import Session +from . import models +from api import schemas +from core.security import get_password_hash, verify_password + +# --- User CRUD --- + +def get_user(db: Session, user_id: int): + """Obtiene un usuario por su ID.""" + return db.query(models.User).filter(models.User.id == user_id).first() + +def get_user_by_username(db: Session, username: str): + """Obtiene un usuario por su nombre de usuario.""" + return db.query(models.User).filter(models.User.username == username).first() + +def get_user_by_email(db: Session, email: str): + """Obtiene un usuario por su email.""" + return db.query(models.User).filter(models.User.email == email).first() + +def get_users(db: Session, skip: int = 0, limit: int = 100): + """Obtiene una lista de usuarios con paginación.""" + return db.query(models.User).offset(skip).limit(limit).all() + +def create_user(db: Session, user: schemas.UserCreate): + """Crea un nuevo usuario en la base de datos.""" + hashed_password = get_password_hash(user.password) + db_user = models.User( + username=user.username, + email=user.email, + hashed_password=hashed_password + ) + db.add(db_user) + db.commit() + db.refresh(db_user) + return db_user + +def update_user_password(db: Session, user: models.User, new_password: str): + """Actualiza la contraseña de un usuario.""" + hashed_password = get_password_hash(new_password) + user.hashed_password = hashed_password + db.add(user) + db.commit() + db.refresh(user) + return user + +def authenticate_user(db: Session, username: str, 
password: str): + """ + Autentica a un usuario. Retorna el objeto de usuario si es exitoso, + de lo contrario, retorna None. + """ + user = get_user_by_username(db, username) + if not user: + return None + if not verify_password(password, user.hashed_password): + return None + return user diff --git a/db/database.py b/db/database.py new file mode 100644 index 0000000..c89d2db --- /dev/null +++ b/db/database.py @@ -0,0 +1,30 @@ +# db/database.py +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker + +# Usamos SQLite, que guarda la base de datos en un fichero local "app.db". +SQLALCHEMY_DATABASE_URL = "sqlite:///./app.db" + +# El engine es el punto de entrada a la base de datos. +# connect_args es necesario solo para SQLite para permitir que se use en múltiples hilos (como lo hace FastAPI). +engine = create_engine( + SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False} +) + +# Cada instancia de SessionLocal será una sesión de base de datos. +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +# Base será la clase de la que heredarán nuestros modelos de SQLAlchemy (como la clase User en models.py). +Base = declarative_base() + +# --- Función de dependencia para los endpoints --- +def get_db(): + """ + Dependencia de FastAPI que crea y gestiona una sesión de BD por cada request. 
+ """ + db = SessionLocal() + try: + yield db + finally: + db.close() diff --git a/db/models.py b/db/models.py new file mode 100644 index 0000000..fc2fcd6 --- /dev/null +++ b/db/models.py @@ -0,0 +1,12 @@ +# db/models.py +from sqlalchemy import Boolean, Column, Integer, String +from .database import Base + +class User(Base): + __tablename__ = "users" + + id = Column(Integer, primary_key=True, index=True) + username = Column(String, unique=True, index=True, nullable=False) + email = Column(String, unique=True, index=True, nullable=False) + hashed_password = Column(String, nullable=False) + is_active = Column(Boolean, default=True) diff --git a/main.py b/main.py new file mode 100644 index 0000000..364b3bf --- /dev/null +++ b/main.py @@ -0,0 +1,41 @@ +# main.py +from fastapi import FastAPI, Request, Depends +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates +from fastapi.staticfiles import StaticFiles +from db.models import User + +from api.dependencies import get_current_user_optional +from api.routers import users, invoices +from db.database import engine, Base + +Base.metadata.create_all(bind=engine) + +app = FastAPI(title="ACME Invoice Processor") + +app.mount("/static", StaticFiles(directory="static"), name="static") +templates = Jinja2Templates(directory="templates") + +app.include_router(users.router, prefix="/api/users", tags=["Users"]) +app.include_router(invoices.router, prefix="/api/invoices", tags=["Invoices"]) + +# --- RUTAS DE PLANTILLAS HTML --- + +@app.get("/", response_class=HTMLResponse) +async def read_login_page(request: Request): + """ + Esta ruta ahora SOLO sirve la página de login. + """ + return templates.TemplateResponse("login.html", {"request": request}) + +@app.get("/dashboard", response_class=HTMLResponse) +async def read_dashboard(request: Request): + """ + Esta es la nueva ruta dedicada para el dashboard. + La seguridad se manejará en el propio HTML con JavaScript. 
+ """ + return templates.TemplateResponse("dashboard.html", {"request": request}) + +@app.get("/register", response_class=HTMLResponse) +async def register_page(request: Request): + return templates.TemplateResponse("register.html", {"request": request}) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..696d4a9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +# pyproject.toml +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "acme-invoice-processor" +version = "1.0.0" +authors = [ + { name = "Daniel Oscar Zamo", email = "daniel.oscar.zamo@gmail.com" }, +] +description = "Aplicación autónoma para la extracción de datos de facturas de Acme Inc. utilizando Google Document AI." +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", +] +# Todas las dependencias ahora están aquí +dependencies = [ + "google-cloud-documentai==3.5.0", + "python-dotenv", + "pyinstaller", + "python-dateutil", + # Las dependencias transitivas de google-cloud-documentai serán gestionadas por pip, + # pero es buena práctica listarlas explícitamente si se requiere fijar versiones. 
+] + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a620177 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,46 @@ +# requirements.txt +# Dependencias de la aplicación y del toolkit unificadas + +# Framework Web y Servidor +fastapi +uvicorn[standard] + +# Base de datos (ORM) +sqlalchemy +# Para usar SQLite (simple para empezar) +pydantic-settings + +# Autenticación y Seguridad +python-jose[cryptography] +passlib[bcrypt] +python-multipart # Para subida de archivos + +# Plantillas HTML +jinja2 + +# Dependencias directas +google-cloud-documentai==3.5.0 +python-dotenv +pyinstaller +python-dateutil + +# Dependencias transitivas (fijadas para consistencia, tomadas del toolkit) +cachetools==5.5.2 +certifi==2025.8.3 +charset-normalizer==3.4.3 +google-api-core==2.25.1 +google-auth==2.40.3 +googleapis-common-protos==1.70.0 +grpcio==1.74.0 +grpcio-status==1.74.0 +idna==3.10 +proto-plus==1.26.1 +protobuf==6.32.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +requests==2.32.4 +rsa==4.9.1 +urllib3==2.5.0 + +# Validación de email para Pydantic +email-validator diff --git a/services/__init__.py b/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/gcp_document_ai_client.py b/services/gcp_document_ai_client.py new file mode 100644 index 0000000..191b34f --- /dev/null +++ b/services/gcp_document_ai_client.py @@ -0,0 +1,36 @@ +# services/gcp_document_ai_client.py +from google.api_core.client_options import ClientOptions +from google.api_core.exceptions import GoogleAPICallError +from google.cloud import documentai + +def process_document_gcp( + project_id: str, + location: str, + processor_id: str, + file_bytes: bytes, + mime_type: str, +) -> documentai.Document: + """ + Procesa el contenido de un documento en bytes usando la API de Google Document AI. + Esta función ahora solo se encarga de la comunicación con GCP. 
+    """
+    # No try/except on purpose: errors raised by the client propagate unchanged,
+    # so callers still see the concrete GoogleAPICallError subclass (and its
+    # traceback) instead of a re-wrapped base-class instance or a bare Exception.
+    client_options = ClientOptions(
+        api_endpoint=f"{location}-documentai.googleapis.com"
+    )
+    client = documentai.DocumentProcessorServiceClient(client_options=client_options)
+
+    resource_name = client.processor_path(project_id, location, processor_id)
+
+    raw_document = documentai.RawDocument(
+        content=file_bytes, mime_type=mime_type
+    )
+    request = documentai.ProcessRequest(name=resource_name, raw_document=raw_document)
+
+    result = client.process_document(request=request)
+    return result.document
diff --git a/services/invoice_processor_service.py b/services/invoice_processor_service.py
new file mode 100644
index 0000000..6fd3165
--- /dev/null
+++ b/services/invoice_processor_service.py
@@ -0,0 +1,55 @@
+# services/invoice_processor_service.py
+import logging
+from typing import Dict, List, Any
+
+# Importamos nuestro nuevo cliente GCP de forma local y limpia
+from .gcp_document_ai_client import process_document_gcp
+
+# (Opcional, si tienes utilidades) from .utils import data_cleaner
+
+# Importamos la configuración centralizada
+from core.config import settings
+
+# --- Lógica de negocio extraída del antiguo processing.py ---
+def _get_confidence_threshold_for_field(field_type: str) -> float:
+    return settings.CONFIDENCE_THRESHOLDS.get(field_type, settings.CONFIDENCE_THRESHOLDS["__default__"])
+
+def _extract_specific_fields(entities: List[Any]) -> Dict[str, str]:
+    extracted_data = {field: "Not found or low confidence" for field in settings.REQUIRED_FIELDS}
+
+    for entity in entities:
+        entity_type = entity.type_
+        threshold = _get_confidence_threshold_for_field(entity_type)
+
+        if entity_type in settings.REQUIRED_FIELDS and entity.confidence >= threshold:
+            value = entity.mention_text.replace('\n', ' ').strip()
+            # if entity_type ==
'invoice_date':
+            #     value = data_cleaner.normalize_date(value) or f"Unparseable Date: '{value}'"
+            extracted_data[entity_type] = value
+    return extracted_data
+
+# --- Main service entry point ---
+def process_invoice_from_bytes(
+    file_bytes: bytes,
+    mime_type: str,
+    *,
+    project_id: str | None = None,
+    location: str | None = None,
+    processor_id: str | None = None,
+) -> Dict[str, str]:
+    """
+    Run a document through Document AI and return the validated invoice fields.
+
+    Args:
+        file_bytes: Raw content of the document to process.
+        mime_type: MIME type of ``file_bytes``; forwarded to the GCP client as-is.
+        project_id: Optional override; falls back to ``settings.GCP_PROJECT_ID``.
+        location: Optional override; falls back to ``settings.GCP_LOCATION``.
+        processor_id: Optional override; falls back to ``settings.DOCAI_PROCESSOR_ID``.
+
+    Returns:
+        The mapping built by ``_extract_specific_fields`` from the document's entities.
+
+    Raises:
+        Exception: Any error from the GCP call or the extraction step is logged
+            with its traceback and re-raised unchanged.
+    """
+    try:
+        # 1. Call Document AI through the dedicated client. Keyword overrides let
+        #    callers that already hold the GCP coordinates pass them explicitly;
+        #    omitted (or empty) values fall back to the central settings.
+        document = process_document_gcp(
+            project_id=project_id or settings.GCP_PROJECT_ID,
+            location=location or settings.GCP_LOCATION,
+            processor_id=processor_id or settings.DOCAI_PROCESSOR_ID,
+            file_bytes=file_bytes,
+            mime_type=mime_type,
+        )
+
+        # 2. Apply the business rules that extract and validate the fields.
+        validated_data = _extract_specific_fields(document.entities)
+
+        logging.info("Documento procesado con éxito y datos validados.")
+        return validated_data
+
+    except Exception as e:
+        # logging.exception logs at ERROR level and attaches the traceback.
+        logging.exception("Error en el flujo de procesamiento de factura: %s", e)
+        # Re-raise so the API endpoint can turn the failure into a 500 response.
+        raise
diff --git a/services/utils/__init__.py b/services/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/services/utils/data_cleaner.py b/services/utils/data_cleaner.py
new file mode 100644
index 0000000..83a78bb
--- /dev/null
+++ b/services/utils/data_cleaner.py
@@ -0,0 +1,87 @@
+# src/cli_invoice_processor/data_cleaner.py
+import logging
+import locale
+from dateutil import parser
+from typing import Optional
+from datetime import datetime
+
+SPANISH_TO_ENGLISH_MONTHS = {
+    'enero': 'january',
+    'febrero': 'february',
+    'marzo': 'march',
+    'abril': 'april',
+    'mayo': 'may',
+    'junio': 'june',
+    'julio': 'july',
+    'agosto': 'august',
+    'septiembre': 'september',
+    'octubre': 'october',
+    'noviembre': 'november',
+    'diciembre': 'december'
+}
+
+def _parse_with_fallback(date_string: str) -> Optional[datetime]:
+    """
+    Intenta parsear la fecha usando un fallback manual
que primero limpia
+    preposiciones comunes en español ("de", "del") y luego traduce los meses.
+    """
+    # 1. Lower-case so the month lookup below is case-insensitive.
+    cleaned = date_string.lower()
+
+    # 2. Translate the first Spanish month name found into English.
+    for spa, eng in SPANISH_TO_ENGLISH_MONTHS.items():
+        if spa in cleaned:
+            cleaned = cleaned.replace(spa, eng)
+            break  # stop after the first month that matches
+
+    # 3. Drop common prepositions; the surrounding spaces keep words from merging.
+    cleaned = cleaned.replace(' de ', ' ').replace(' del ', ' ')
+
+    # After cleaning, the string should look like '5 january 2030'.
+    try:
+        logging.info("Attempting to parse cleaned date string: '%s'", cleaned)
+        return parser.parse(cleaned)
+    except (parser.ParserError, ValueError, OverflowError):
+        # Nothing more can be done if the cleaned string still fails to parse.
+        logging.warning("Fallback parsing failed even for cleaned string: '%s'", cleaned)
+        return None
+
+def normalize_date(date_string: str) -> Optional[str]:
+    """
+    Parse a date string in various formats and normalize it to DD/MM/YYYY.
+
+    The string is first handed to ``dateutil`` unchanged; if that fails, the
+    manual cleaning and month-translation fallback is tried.
+
+    Args:
+        date_string: The raw date text. Empty or ``None`` yields ``None``.
+
+    Returns:
+        The date formatted as ``DD/MM/YYYY``, or ``None`` when it cannot be parsed.
+    """
+    if not date_string:
+        return None
+
+    # No locale switching: dateutil matches month names against its own
+    # parserinfo tables rather than the process locale, and setlocale() mutates
+    # process-wide state, which is unsafe when requests run concurrently.
+    try:
+        parsed_date = parser.parse(date_string)
+    except (parser.ParserError, ValueError, OverflowError):
+        logging.warning("Could not parse date '%s' directly. Attempting robust fallback.", date_string)
+        parsed_date = _parse_with_fallback(date_string)
+
+    if parsed_date is None:
+        # Both strategies failed; record the final error.
+        logging.error("Failed to parse date '%s' with all available methods.", date_string)
+        return None
+
+    # '%d' -> zero-padded day (05), '%m' -> zero-padded month (01), '%Y' -> year (2030)
+    return parsed_date.strftime('%d/%m/%Y')
\ No newline at end of file
diff --git a/templates/dashboard.html b/templates/dashboard.html
new file mode 100644
index 0000000..eb2e7f3
--- /dev/null
+++ b/templates/dashboard.html
@@ -0,0 +1,64 @@
+
+
+
+
+
+    Dashboard - ACME Invoice Processor
+
+
+
+
+
+
+
+
+
+
+
+

Sube una Factura para Procesar

+
+ + +
+
+
+

Resultados de la Extracción:

+

+        
+
+ + + + diff --git a/templates/login.html b/templates/login.html new file mode 100644 index 0000000..c47db07 --- /dev/null +++ b/templates/login.html @@ -0,0 +1,91 @@ + + + + + + Iniciar Sesión - ACME Invoice Processor + + + +
+

Iniciar Sesión

+
+
+ + +
+
+ + +
+ + +
+

¿No tienes una cuenta? Regístrate

+
+
+ + + + + diff --git a/templates/register.html b/templates/register.html new file mode 100644 index 0000000..a879d78 --- /dev/null +++ b/templates/register.html @@ -0,0 +1,103 @@ + + + + + + Registro - ACME Invoice Processor + + + +
+

Registrar Nuevo Usuario

+ +
+
+ + +
+
+ + +
+
+ + +
+ +
+

¿Ya tienes una cuenta? Inicia sesión

+ + +
+
+ + + + + +