import sys, json, re
# Switch stdout to UTF-8: the report printed further down echoes raw
# document text, which contains non-ASCII characters.
sys.stdout.reconfigure(encoding="utf-8")

# Typographic single quotation marks that get normalised to a plain "'"
# before keyword matching.
APOS_LEFT = "\u2018"   # left single quotation mark
APOS_CURLY = "\u2019"  # right single quotation mark (curly apostrophe)

# `data` is iterated below as a sequence of items read through their
# 'text' and 'file' keys.
with open("pdf_texts.json", encoding="utf-8") as handle:
    data = json.load(handle)

# Keyword table used to classify a document.
#
# Each key is a category label; each value lists lowercase substrings that
# are looked up in the lowercased document text. The lookup walks this dict
# in insertion order and stops at the first category with any hit, so the
# ORDER OF THE CATEGORIES IS SIGNIFICANT: earlier categories win.
#
# NOTE(review): 'livraison' (Transport) is a substring of 'date de livraison'
# (Ventes), so that Ventes keyword can never decide the category on its own.
# NOTE(review): 'ups' is matched as a plain substring, so it also hits words
# such as "groups" -- confirm this is acceptable.
rules = {
    "AGL": [
        "self-billing invoice",
        "self billing",
    ],
    "Commissions": [
        "frais de vente",
        "selling on amazon fees",
        "selling on amazon",
        "amazon services europe",
        "seller fees",
        "verkaufen bei amazon",
        "commissioni di vendita",
        "tarifas de vender en amazon",
        "verkoopkosten",
        "merchant vat",
        "steuergutschrift",
        "refunded fees",
        "ais_refunded_fees_text",
        "ais_selling_on_amazon_fees_text",
        "credit note",
        "note de credit d'impot",
        "note de credit d impot",
        "nota di credito",
        "nota de credito",
        "gutschrift",
    ],
    "Transport": [
        "frais d'expedition par amazon",
        "frais d'expedition",
        "frais d expédition par amazon",
        "frais d expédition",
        "fulfillment by amazon",
        "fulfilment by amazon",
        "fba fees",
        "ais_fulfillment_by_amazon_fees_text",
        "versand durch amazon",
        "gebühren im zusammenhang mit",
        "commissioni di logistica",
        "tarifas de logistica",
        "tarifas de log",
        "kosten voor fulfillment",
        "koszty realizacji",
        "shipping",
        "livraison",
    ],
    "Stockage": [
        "stockage",
        "storage fees",
        "inventory storage",
        "lagergebühren",
        "spese di stoccaggio",
        "tarifas de almacenamiento",
        "opslagkosten",
        "magazijnkosten",
    ],
    "Publicite": [
        "advertising",
        "publicité",
        "sponsored",
    ],
    "Emballage": [
        "packaging",
        "emballage",
    ],
    "Technologie": [
        "technology fee",
        "technologie",
    ],
    "REP": [
        "extended producer responsibility",
        "mandataire fiscal",
        "eco-contributions",
    ],
    "UPS": [
        "ups",
    ],
    "Remboursement": [
        "reimbursement",
        "lost inventory",
        "damaged inventory",
        "remboursement",
    ],
    "Ventes": [
        "rapport de ventes",
        "sales report",
        "merchant tax report",
        "amazon-vat-res",
        "b2b-b2c",
        "numéro de commande",
        "asin-",
        "date de livraison",
        "prix unitaire (inclus tva)",
    ],
}

# --- Extraction patterns, compiled once rather than on every iteration ------

_WHITESPACE_RE = re.compile(r'\s+')
_CURRENCY_RE = re.compile(r"eur|gbp|usd|pln|sek|dkk|czk|huf|ron|chf|bgn|\u20ac|\xa3|\$|z\u0142")

# Four amounts glued together (after whitespace and currency markers are
# stripped), the second one followed by '%'. Presumably a
# "net / VAT rate / VAT / gross" summary row -- the fourth number is used.
_SUMMARY_ROW_RE = re.compile(
    r'(-?\d+[.,]\d{2})(-?\d+[.,]\d{1,2})%(-?\d+[.,]\d{2})(-?\d+[.,]\d{2})'
)

# A "total" label followed by three amounts; the third number is used.
_TOTAL_ROW_RE = re.compile(
    r'(?:total|gesamtsumme|totaal|totale)[^\d-]*?(-?\d+[.,]\d{2})[^\d-]*?(-?\d+[.,]\d{2})[^\d-]*?(-?\d+[.,]\d{2})',
    re.I,
)

# Invoice identifier: 5-25 characters out of letters, digits, '-' and '_',
# containing at least one digit. Without the digit requirement the
# case-insensitive [A-Z] class accepts any ordinary word after the label
# ("Invoice details" -> "details"), so a missing number was never detected.
_INVOICE_ID = r'((?=[A-Z0-9\-_]{0,24}\d)[A-Z0-9\-_]{5,25})'

_LABELLED_INVOICE_RE = re.compile(
    r"(?:facture n|invoice number|credit note number|rechnungsnummer|n.mero de la factura|num.ro de la facture|numero fattura|factuurnummer|numero nota di credito)[\s:]*"
    + _INVOICE_ID,
    re.I,
)
_GENERIC_INVOICE_RE = re.compile(
    r'(?:invoice|facture|rechnung|factura|fattura|factuur)[\s\:\#]*' + _INVOICE_ID,
    re.I,
)


def classify(clean, rule_table):
    """Return the first category whose keywords occur in *clean*.

    Args:
        clean: Lowercased document text.
        rule_table: Mapping of category name to an iterable of lowercase
            keyword substrings. Categories are tried in the mapping's
            iteration order; the first one with any hit wins.

    Returns:
        The matching category name, or ``'Inconnu'`` when nothing matches.
    """
    for category, keywords in rule_table.items():
        if any(keyword in clean for keyword in keywords):
            return category
    return 'Inconnu'


def extract_ttc(text):
    """Return the total amount found in *text* as a string, or ``None``.

    The summary-row pattern is tried first on a copy of the text with all
    whitespace and currency markers removed; if it fails, the labelled
    "total" row is searched in the unmodified text.
    """
    compact = _CURRENCY_RE.sub('', _WHITESPACE_RE.sub('', text).lower())
    row = _SUMMARY_ROW_RE.search(compact)
    if row:
        return row.group(4)
    total = _TOTAL_ROW_RE.search(text)
    return total.group(3) if total else None


def find_invoice_number(text):
    """Return the invoice identifier found in *text*, or ``None``.

    Explicit labels ("invoice number", "rechnungsnummer", ...) are preferred;
    a bare "invoice"/"facture"/... keyword is used only as a fallback.
    """
    found = _LABELLED_INVOICE_RE.search(text) or _GENERIC_INVOICE_RE.search(text)
    return found.group(1) if found else None


if __name__ == "__main__":
    ok = 0
    fail = 0

    for item in data:
        # A missing or null 'text' is treated as an empty document so it is
        # reported as a failure instead of aborting the whole run.
        raw = item.get('text') or ''
        # Normalize typographic apostrophes (ROOT CAUSE of many classification failures)
        text = raw.replace(APOS_CURLY, "'").replace(APOS_LEFT, "'").replace('\n', ' ')

        doc_type = classify(text.lower(), rules)
        ttc = extract_ttc(text)
        inv = find_invoice_number(text)

        if doc_type != 'Inconnu' and ttc is not None and inv is not None:
            ok += 1
            continue

        fail += 1
        file_name = item.get('file', '?')
        print(f'FAIL [{doc_type}]: {file_name}')
        if doc_type == 'Inconnu':
            # Show the start of the text to help add a matching keyword.
            print(f'  -> {text[:200]}')
        if inv is None:
            print('  -> N facture manquant')
        if ttc is None:
            print('  -> Montants manquants')

    print(f'\nOK: {ok} | FAIL: {fail} | Total: {ok+fail}')
