# eptm_dashboard/src/parser_bn.py

"""PDF parser for EPTM Bulletins de Notes (BN).

Two PDF variants:
  - DUAL : classes AUTOMAT / DUAL — 1 page per apprenti
            groups: Branches de culture générale (CG) + Branches professionnelles (BP)
  - EM   : classes EM-* — 2 pages per apprenti (page 1 = notes, page 2 = observations)
            groups: Branches professionnelles (BP) + Travaux pratiques (TP)

Extracted rows (per group):
  - Branches individuelles          → branches: [{"nom": str, "notes": [...]}]
  - Moyenne semestrielle du groupe  → moy_sem[0..7]
  - Moyenne annuelle du groupe      → moy_ann[0..7]  (non-null at Sem.2,4,6,8 positions)

Global rows:
  - Moyenne semestrielle globale    → globale.moy_sem[0..7]
  - Moyenne annuelle globale        → globale.moy_ann[0..7]
"""
from __future__ import annotations

import math
import re
from pathlib import Path

import pdfplumber

# Class identifier following the word "Classe": one uppercase letter, then
# uppercase letters / digits / hyphens, optionally followed by whitespace and
# a number.
_RE_CLASSE = re.compile(r"Classe\s+([A-Z][A-Z0-9\-]+(?:\s+\d+)?)")
# Two-digit year pair such as "23-24".
_RE_YEAR   = re.compile(r"(\d{2}-\d{2})")
# Semester number following "Sem." (e.g. "Sem. 1" captures "1").
_RE_SEM_N  = re.compile(r"Sem\.\s*(\d+)")


# ── Helpers ───────────────────────────────────────────────────────────────────

def _to_float(cell) -> float | None:
    """Convert a table cell to a finite float, or return None.

    The cell is stringified, stripped, and a decimal comma is replaced by a
    decimal point before conversion.

    Args:
        cell: Raw cell value (typically ``str`` or ``None``).

    Returns:
        The parsed number, or ``None`` when the cell is ``None``, is not
        numeric, or parses to a non-finite value (NaN / infinity).
    """
    if cell is None:
        return None
    text = str(cell).strip().replace(",", ".")
    try:
        value = float(text)
    except ValueError:
        return None
    # float() also accepts "nan", "inf" and "infinity" (any case, signed);
    # none of these is a usable grade, so treat them as non-numeric cells.
    return value if math.isfinite(value) else None


# Prefixes that mark a postal-address line (street keyword or 4-digit number)
# rather than a person's name.
_RE_ADDRESS_START = re.compile(
    r"^(Route|Rue|Chemin|Impasse|Avenue|Allée|Côte|Case|Clos|Place|Cours|\d{4})",
    re.I,
)

# Substrings identifying letterhead / title lines that must be ignored.
# NOTE(review): matching is substring-based and case-sensitive, so a surname
# that contains e.g. "page" or "Sion" is skipped too — confirm acceptable.
_NAME_SKIP_KEYWORDS = frozenset({
    "EPTM", "Professionnelle", "Technique", "Département",
    "Service", "Ecole", "École", "formation", "Canton",
    "Kanton", "page", "Sion", "Saint", "BULLETIN", "NOTES",
})


def _extract_name(page) -> tuple[str, str]:
    """Return (nom, prenom) from the top-left address block using word coordinates.

    Words located in the left 48 % of the page are grouped into visual lines.
    The first line that is neither an address line, a letterhead line, nor a
    line containing a digit, and that starts with an uppercase character, is
    split at its first space into ``(nom, prenom)``.

    Args:
        page: Object exposing ``extract_words()`` (dicts with "text", "x0",
            "top") and ``width``.

    Returns:
        ``(nom, prenom)``; ``("", "")`` when no candidate line is found or
        when word extraction fails.
    """
    try:
        words = page.extract_words()
    except Exception:
        # Best-effort: a page whose words cannot be extracted has no name.
        return "", ""

    if not words:
        return "", ""

    # Keep only words on the left half of the page.
    left_limit = page.width * 0.48
    left = [w for w in words if w["x0"] < left_limit]
    if not left:
        return "", ""

    # Coarse top-to-bottom ordering (3pt buckets), left-to-right inside a bucket.
    left.sort(key=lambda w: (round(w["top"] / 3) * 3, w["x0"]))

    # Group into lines: a word joins the current line when it sits less than
    # 5pt (vertically) from the previous word.
    lines: list[list[dict]] = [[left[0]]]
    for w in left[1:]:
        if abs(w["top"] - lines[-1][-1]["top"]) < 5:
            lines[-1].append(w)
        else:
            lines.append([w])

    for line_words in lines:
        # Words of one visual line can fall into two adjacent 3pt buckets, in
        # which case the bucket sort above leaves them out of horizontal
        # order. Restore left-to-right order before joining, otherwise nom
        # and prenom may be swapped.
        ordered = sorted(line_words, key=lambda w: w["x0"])
        text = " ".join(w["text"] for w in ordered).strip()
        if not text:
            continue
        if _RE_ADDRESS_START.match(text):
            continue
        if any(kw in text for kw in _NAME_SKIP_KEYWORDS):
            continue
        if any(c.isdigit() for c in text):
            continue
        parts = text.split(" ", 1)
        if parts[0] and parts[0][0].isupper():
            return parts[0], (parts[1] if len(parts) > 1 else "")

    return "", ""


def _find_bn_table_obj(page):
    """Locate the grades table on *page*.

    Returns a pair ``(table, rows)`` where ``table`` is the first object from
    ``page.find_tables()`` whose extracted content has at least 4 rows and a
    first row of at least 7 cells with one containing "Sem.", and ``rows`` is
    that extracted content. The table object is returned alongside the rows
    so that callers can use per-cell bounding boxes to align sub-lines.

    Returns ``(None, None)`` when no table qualifies.
    """
    for candidate in page.find_tables():
        rows = candidate.extract()
        if rows and len(rows) >= 4:
            first_row = rows[0]
            if len(first_row) >= 7 and any(
                "Sem." in str(cell) for cell in first_row if cell
            ):
                return candidate, rows
    return None, None


def _cell_lines(page, bbox):
    """Return the visual text lines found inside one table cell.

    The page is cropped to *bbox* and its words are grouped into lines: a
    word joins the current line when its ``top`` is less than 4pt away from
    the previous word's. Each returned dict holds the line's mean ``"top"``
    and its ``"text"`` (words joined left to right), which callers use to
    align branch labels with their grades.

    Returns an empty list when *bbox* is None, when cropping / extraction
    raises, or when the cell contains no words.
    """
    if bbox is None:
        return []
    try:
        cell_words = page.crop(bbox).extract_words()
    except Exception:
        return []
    if not cell_words:
        return []

    # Top-to-bottom, then left-to-right.
    cell_words.sort(key=lambda w: (w["top"], w["x0"]))

    grouped: list[list[dict]] = []
    for word in cell_words:
        if grouped and abs(word["top"] - grouped[-1][-1]["top"]) < 4:
            grouped[-1].append(word)
        else:
            grouped.append([word])

    result = []
    for group in grouped:
        ordered = sorted(group, key=lambda w: w["x0"])
        mean_top = sum(w["top"] for w in ordered) / len(ordered)
        joined = " ".join(w["text"] for w in ordered).strip()
        result.append({"top": mean_top, "text": joined})
    return result


def _find_bn_table(tables: list) -> list | None:
    """Return the first extracted table that looks like the BN grades table.

    A table qualifies when it has at least 4 rows and its first row has at
    least 7 cells, one of which contains "Sem.". Returns None otherwise.

    Kept for backward compatibility — preferred path is _find_bn_table_obj.
    """
    def _is_grades_table(candidate) -> bool:
        if not candidate or len(candidate) < 4:
            return False
        first_row = candidate[0]
        if len(first_row) < 7:
            return False
        return any("Sem." in str(cell) for cell in first_row if cell)

    return next((t for t in tables if _is_grades_table(t)), None)


def _is_notes_page(text: str) -> bool:
    """Tell whether *text* is the text of a grades page.

    True when the text contains the "BULLETIN DE NOTES" title and at least
    one of the group headings "Branches professionnelles" or
    "Branches de culture".
    """
    if "BULLETIN DE NOTES" not in text:
        return False
    return "Branches professionnelles" in text or "Branches de culture" in text


# ── Core page parser ──────────────────────────────────────────────────────────

# Number of semester columns handled per grades table.
_NB_SEMESTERS = 8

# Group headers carry a coefficient such as "(1x)" (e.g. "Travaux pratiques
# (1x)"). This distinguishes them from a branch row that merely has the same
# name as a group ("Travaux pratiques").
_RE_GROUP_COEF = re.compile(r"\(\d+x\)")


def _new_group() -> dict:
    """Return an empty accumulator for one group of branches."""
    return {
        "moy_sem":  [None] * _NB_SEMESTERS,
        "moy_ann":  [None] * _NB_SEMESTERS,
        "branches": [],
    }


def _group_key(low: str) -> str | None:
    """Map a lower-cased row label to "CG" / "BP" / "TP", or None if the
    label is not a recognised group header."""
    if not _RE_GROUP_COEF.search(low):
        return None
    if "branches de culture" in low or "culture g" in low:
        return "CG"
    if "branches professionnelles" in low:
        return "BP"
    if "travaux pratiques" in low:
        return "TP"
    return None


def _branches_from_row(page, table_row) -> list[dict]:
    """Split one table row into individual branches using vertical positions.

    A single table row may stack several branches as separate visual lines
    inside its cells. Per the original author's note, text extraction of the
    table does not preserve empty sub-lines (e.g. 25 branch labels but only 7
    visible values in the Sem.1 column), so splitting cell text on newlines
    would shift grades onto the wrong branch. Instead, every label line is
    matched with the grade line located less than 4pt away vertically in each
    semester column.

    Returns a list of ``{"nom": str, "notes": [float | None] * 8}``; empty
    when the row has no usable label cell.
    """
    if table_row is None:
        return []
    cells = table_row.cells
    if not cells or len(cells) < 2 or cells[0] is None:
        return []
    label_lines = _cell_lines(page, cells[0])
    if not label_lines:
        return []

    # Visual lines of each semester column (missing columns → bbox None → []).
    col_lines = [
        _cell_lines(page, cells[col] if col < len(cells) else None)
        for col in range(1, _NB_SEMESTERS + 1)
    ]

    branches = []
    for lab in label_lines:
        notes = []
        for col in col_lines:
            # First grade line vertically aligned with the label, if any.
            hit = next(
                (nl for nl in col if abs(nl["top"] - lab["top"]) < 4), None
            )
            notes.append(_to_float(hit["text"]) if hit is not None else None)
        branches.append({"nom": lab["text"], "notes": notes})
    return branches


def parse_bn_page(page) -> dict | None:
    """Parse one BN page.

    Args:
        page: A pdfplumber page (uses ``extract_text``, ``extract_words``,
            ``width``, ``find_tables`` and ``crop``).

    Returns:
        None for non-grade pages (observations, cover…), for pages without a
        grades table, or when no group header was recognised. Otherwise a
        dict with keys:

        - "nom", "prenom", "classe": str
        - "type_classe": "EM" when the page text contains
          "Travaux pratiques", else "DUAL"
        - "sem_labels": 8 raw header labels (None where absent)
        - "groupes": {"CG"|"BP"|"TP": {"moy_sem": [...], "moy_ann": [...],
          "branches": [{"nom": str, "notes": [...]}, ...]}}
        - "globale": {"moy_sem": [...], "moy_ann": [...]}
    """
    text = page.extract_text() or ""
    if not _is_notes_page(text):
        return None

    class_match = _RE_CLASSE.search(text)
    classe = class_match.group(1).strip() if class_match else ""
    type_classe = "EM" if "Travaux pratiques" in text else "DUAL"

    nom, prenom = _extract_name(page)

    table_obj, bn_table = _find_bn_table_obj(page)
    if not bn_table:
        return None

    # Semester labels from the header row (columns 1-8), padded with None
    # when the table has fewer columns.
    sem_labels: list[str | None] = [
        str(cell).strip() if cell else None
        for cell in bn_table[0][1:1 + _NB_SEMESTERS]
    ]
    sem_labels.extend([None] * (_NB_SEMESTERS - len(sem_labels)))

    table_rows = table_obj.rows  # bbox-aware rows, indexed like bn_table

    current_group: str | None = None
    groups: dict[str, dict] = {}
    globale: dict[str, list] = {
        "moy_sem": [None] * _NB_SEMESTERS,
        "moy_ann": [None] * _NB_SEMESTERS,
    }

    for idx, row in enumerate(bn_table[1:], start=1):
        if not row or not row[0]:
            continue
        low = str(row[0]).strip().lower()
        vals = [
            _to_float(row[col]) if col < len(row) else None
            for col in range(1, _NB_SEMESTERS + 1)
        ]

        key = _group_key(low)
        if key is not None:
            current_group = key
            groups.setdefault(key, _new_group())
        elif "moyenne semestrielle du groupe" in low and current_group:
            groups[current_group]["moy_sem"] = vals
        elif "moyenne annuelle du groupe" in low and current_group:
            groups[current_group]["moy_ann"] = vals
        elif "moyenne semestrielle globale" in low:
            globale["moy_sem"] = vals
        elif "moyenne annuelle globale" in low:
            globale["moy_ann"] = vals
            # Everything after this row (Absences, Observations, …) is ignored.
            break
        elif current_group is not None:
            # Any other row inside a group holds individual branches; align
            # labels and grades by vertical position.
            table_row = table_rows[idx] if idx < len(table_rows) else None
            groups[current_group]["branches"].extend(
                _branches_from_row(page, table_row)
            )

    if not groups:
        return None

    return {
        "nom": nom,
        "prenom": prenom,
        "classe": classe,
        "type_classe": type_classe,
        "sem_labels": sem_labels,
        "groupes": groups,
        "globale": globale,
    }


# ── Public API ────────────────────────────────────────────────────────────────

def parse_bn_pdf(pdf_path: str | Path) -> dict:
    """Parse every page of a BN PDF and collect one record per grades page.

    Pages for which ``parse_bn_page`` returns nothing (e.g. the observations
    page of a two-page bulletin) are skipped.

    Returns::

        {
            "classe":      str,                  # from the first record
            "type_classe": "EM" | "DUAL",        # from the first record
            "sem_labels":  [str|None, ...],      # from the first record
            "apprentis":   [{
                "nom": str, "prenom": str, "classe": str,
                "type_classe": str,
                "sem_labels": [...],
                "groupes": {
                    "CG"|"BP"|"TP": {
                        "moy_sem": [...], "moy_ann": [...],
                        "branches": [{"nom": str, "notes": [...]}, ...],
                    },
                    ...
                },
                "globale": {"moy_sem": [...], "moy_ann": [...]},
            }, ...]
        }

    When no page yields a record, the string fields are empty and both
    "sem_labels" and "apprentis" are empty lists.
    """
    with pdfplumber.open(pdf_path) as pdf:
        records = [rec for rec in map(parse_bn_page, pdf.pages) if rec]

    if records:
        first = records[0]
        return {
            "classe": first["classe"],
            "type_classe": first["type_classe"],
            "sem_labels": first["sem_labels"],
            "apprentis": records,
        }

    return {"classe": "", "type_classe": "", "sem_labels": [], "apprentis": []}


# ── Label helpers (used by app.py) ────────────────────────────────────────────

def sem_short_label(raw: str | None, idx: int) -> str:
    """Short semester label: 'Sem. 1\\n23-24 1' → 'S1'.

    Falls back to 'S{idx+1}' when *raw* is empty or has no 'Sem. N' part.
    """
    found = _RE_SEM_N.search(str(raw)) if raw else None
    if found is None:
        return f"S{idx + 1}"
    return f"S{found.group(1)}"


def sem_full_label(raw: str | None, idx: int) -> str:
    """Semester label with year: 'Sem. 1\\n23-24 1' → 'S1 23-24'.

    Returns just the short label (see ``sem_short_label``) when *raw*
    carries no year.
    """
    label = sem_short_label(raw, idx)
    year = sem_year_only(raw)
    return f"{label} {year}" if year else label


def sem_year_only(raw: str | None) -> str:
    """Return only the year part ('23-24') of a raw label, '' if absent."""
    if not raw:
        return ""
    found = _RE_YEAR.search(str(raw))
    return found.group(1) if found else ""


def ann_short_label(sem_labels: list[str | None], idx: int) -> str:
    """Return 'Moy.23-24' using the year embedded in the label at *idx*.

    Falls back to 'Moy.S{idx+1}' when *idx* is past the end of
    *sem_labels*, or when the label is empty or carries no year.
    """
    raw = None
    if idx < len(sem_labels):
        raw = sem_labels[idx]
    found = _RE_YEAR.search(str(raw)) if raw else None
    if found:
        return f"Moy.{found.group(1)}"
    return f"Moy.S{idx + 1}"