"""PDF parser for EPTM Bulletins de Notes (BN). Two PDF variants: - DUAL : classes AUTOMAT / DUAL — 1 page per apprenti groups: Branches de culture générale (CG) + Branches professionnelles (BP) - EM : classes EM-* — 2 pages per apprenti (page 1 = notes, page 2 = observations) groups: Branches professionnelles (BP) + Travaux pratiques (TP) Extracted rows (per group): - Branches individuelles → branches: [{"nom": str, "notes": [...]}] - Moyenne semestrielle du groupe → moy_sem[0..7] - Moyenne annuelle du groupe → moy_ann[0..7] (non-null at Sem.2,4,6,8 positions) Global rows: - Moyenne semestrielle globale → globale.moy_sem[0..7] - Moyenne annuelle globale → globale.moy_ann[0..7] """ from __future__ import annotations import re from pathlib import Path import pdfplumber _RE_CLASSE = re.compile(r"Classe\s+([A-Z][A-Z0-9\-]+(?:\s+\d+)?)") _RE_YEAR = re.compile(r"(\d{2}-\d{2})") _RE_SEM_N = re.compile(r"Sem\.\s*(\d+)") # ── Helpers ─────────────────────────────────────────────────────────────────── def _to_float(cell) -> float | None: if cell is None: return None s = str(cell).strip().replace(",", ".") try: return float(s) except ValueError: return None def _extract_name(page) -> tuple[str, str]: """Return (nom, prenom) from the top-left address block using word coordinates.""" try: words = page.extract_words() except Exception: return "", "" if not words: return "", "" page_width = page.width # Keep only words on the left half of the page left = [w for w in words if w["x0"] < page_width * 0.48] if not left: return "", "" left.sort(key=lambda w: (round(w["top"] / 3) * 3, w["x0"])) # Group into lines (words within 4pt vertical distance) lines: list[list[dict]] = [[left[0]]] for w in left[1:]: if abs(w["top"] - lines[-1][-1]["top"]) < 5: lines[-1].append(w) else: lines.append([w]) address_starts = re.compile( r"^(Route|Rue|Chemin|Impasse|Avenue|Allée|Côte|Case|Clos|Place|Cours|\d{4})", re.I, ) skip_kw = {"EPTM", "Professionnelle", "Technique", "Département", "Service", "Ecole", "École", "formation", "Canton", "Kanton", "page", "Sion", "Saint", "BULLETIN", "NOTES"} for line_words in lines: text = " ".join(w["text"] for w in line_words).strip() if not text: continue if address_starts.match(text): continue if any(kw in text for kw in skip_kw): continue if any(c.isdigit() for c in text): continue parts = text.split(" ", 1) if parts[0] and parts[0][0].isupper(): return parts[0], (parts[1] if len(parts) > 1 else "") return "", "" def _find_bn_table_obj(page): """Retourne l'objet Table (avec bbox) correspondant à la table des notes, et le contenu extrait sous forme list[list[str]]. Garder l'objet permet d'utiliser les bbox de chaque cellule pour aligner les sous-lignes.""" for tbl in page.find_tables(): ext = tbl.extract() if not ext or len(ext) < 4: continue header = ext[0] if len(header) >= 7 and any(h and "Sem." in str(h) for h in header): return tbl, ext return None, None def _cell_lines(page, bbox): """Retourne la liste des lignes visuelles dans une cellule, avec leur position verticale (top) — sert à aligner branches ↔ notes.""" if bbox is None: return [] try: words = page.crop(bbox).extract_words() except Exception: return [] if not words: return [] words.sort(key=lambda w: (w["top"], w["x0"])) lines: list[list[dict]] = [[words[0]]] for w in words[1:]: if abs(w["top"] - lines[-1][-1]["top"]) < 4: lines[-1].append(w) else: lines.append([w]) out = [] for ln in lines: ln.sort(key=lambda w: w["x0"]) out.append({ "top": sum(w["top"] for w in ln) / len(ln), "text": " ".join(w["text"] for w in ln).strip(), }) return out def _find_bn_table(tables: list) -> list | None: """Return the first table that looks like the BN grades table (≥7 cols, Sem. header). Kept for backward compatibility — preferred path is _find_bn_table_obj. """ for tbl in tables: if not tbl or len(tbl) < 4: continue header = tbl[0] if len(header) >= 7 and any( h and "Sem." in str(h) for h in header ): return tbl return None def _is_notes_page(text: str) -> bool: return ( "BULLETIN DE NOTES" in text and ( "Branches professionnelles" in text or "Branches de culture" in text ) ) # ── Core page parser ────────────────────────────────────────────────────────── def parse_bn_page(page) -> dict | None: """Parse one BN page. Returns None for non-grade pages (observations, cover…).""" text = page.extract_text() or "" if not _is_notes_page(text): return None # Extract classe cm = _RE_CLASSE.search(text) classe = cm.group(1).strip() if cm else "" is_em = "Travaux pratiques" in text type_classe = "EM" if is_em else "DUAL" nom, prenom = _extract_name(page) table_obj, bn_table = _find_bn_table_obj(page) if not bn_table: return None # Semester labels from header row (up to 8 semester columns) header = bn_table[0] sem_labels: list[str | None] = [] for cell in header[1:9]: # cols 1-8 sem_labels.append(str(cell).strip() if cell else None) # Pad to 8 if fewer columns found while len(sem_labels) < 8: sem_labels.append(None) table_rows = table_obj.rows # bbox-aware rows, indexed comme bn_table # Parse data rows current_group: str | None = None groups: dict[str, dict] = {} globale: dict[str, list] = {"moy_sem": [None] * 8, "moy_ann": [None] * 8} def _empty_group() -> dict: return { "moy_sem": [None] * 8, "moy_ann": [None] * 8, "branches": [], } def _branches_from_bbox(table_row) -> list[dict]: """Démultiplexe une ligne du tableau en plusieurs branches en utilisant la position verticale des mots dans chaque cellule. Indispensable car pdfplumber.extract_tables() ne préserve PAS les sous-lignes vides (ex: 25 branches dans le label, 7 valeurs visibles dans la colonne Sem.1 → l'approche par split('\\n') décale tout).""" if table_row is None: return [] cells = table_row.cells if not cells or len(cells) < 2 or cells[0] is None: return [] label_lines = _cell_lines(page, cells[0]) if not label_lines: return [] col_lines: list[list[dict]] = [] for i in range(8): bbox = cells[i + 1] if (i + 1) < len(cells) else None col_lines.append(_cell_lines(page, bbox)) branches = [] for lab in label_lines: notes = [] for col in col_lines: match = None for nl in col: if abs(nl["top"] - lab["top"]) < 4: match = _to_float(nl["text"]) break notes.append(match) branches.append({"nom": lab["text"], "notes": notes}) return branches stop = False # bascule à True après "moyenne annuelle globale" → ignore # les lignes "Absences", "Observations", etc. for idx in range(1, len(bn_table)): if stop: continue row = bn_table[idx] if not row or not row[0]: continue table_row = table_rows[idx] if idx < len(table_rows) else None label = str(row[0]).strip() vals = [ _to_float(row[i + 1]) if (i + 1) < len(row) else None for i in range(8) ] low = label.lower() # Headers de groupe = label avec coefficient "(Nx)" (ex: "Travaux # pratiques (1x)"). Indispensable pour distinguer du label de # branche homonyme "Travaux pratiques" qui apparaît parfois. is_group_header = bool(re.search(r"\(\d+x\)", low)) if is_group_header and ("branches de culture" in low or "culture g" in low): current_group = "CG" groups.setdefault("CG", _empty_group()) elif is_group_header and "branches professionnelles" in low: current_group = "BP" groups.setdefault("BP", _empty_group()) elif is_group_header and "travaux pratiques" in low: current_group = "TP" groups.setdefault("TP", _empty_group()) elif "moyenne semestrielle du groupe" in low and current_group: groups[current_group]["moy_sem"] = vals elif "moyenne annuelle du groupe" in low and current_group: groups[current_group]["moy_ann"] = vals elif "moyenne semestrielle globale" in low: globale["moy_sem"] = vals elif "moyenne annuelle globale" in low: globale["moy_ann"] = vals stop = True # tout ce qui suit (Absences, Observations) est ignoré elif current_group is not None: # Toute autre ligne dans un groupe = branches individuelles. # On utilise la position verticale (bbox) pour aligner branches # ↔ notes — voir docstring de _branches_from_bbox. groups[current_group]["branches"].extend( _branches_from_bbox(table_row) ) if not groups: return None return { "nom": nom, "prenom": prenom, "classe": classe, "type_classe": type_classe, "sem_labels": sem_labels, "groupes": groups, "globale": globale, } # ── Public API ──────────────────────────────────────────────────────────────── def parse_bn_pdf(pdf_path: str | Path) -> dict: """Parse a BN PDF (one or two pages per apprenti). Returns:: { "classe": str, "type_classe": "EM" | "DUAL", "sem_labels": [str|None, ...], # 8 elements "apprentis": [{ "nom": str, "prenom": str, "classe": str, "type_classe": str, "sem_labels": [...], "groupes": { "CG"|"BP"|"TP": {"moy_sem": [...], "moy_ann": [...]}, ... }, "globale": {"moy_sem": [...], "moy_ann": [...]}, }, ...] } """ apprentis: list[dict] = [] with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: rec = parse_bn_page(page) if rec: apprentis.append(rec) if not apprentis: return {"classe": "", "type_classe": "", "sem_labels": [], "apprentis": []} return { "classe": apprentis[0]["classe"], "type_classe": apprentis[0]["type_classe"], "sem_labels": apprentis[0]["sem_labels"], "apprentis": apprentis, } # ── Label helpers (used by app.py) ──────────────────────────────────────────── def sem_short_label(raw: str | None, idx: int) -> str: """'Sem. 1\\n23-24 1' → 'S1'. Fallback: 'S{idx+1}'.""" if raw: m = _RE_SEM_N.search(str(raw)) if m: return f"S{m.group(1)}" return f"S{idx + 1}" def sem_full_label(raw: str | None, idx: int) -> str: """'Sem. 1\\n23-24 1' → 'S1 23-24'. Fallback: 'S{idx+1}' si pas d'année.""" short = sem_short_label(raw, idx) if raw: m = _RE_YEAR.search(str(raw)) if m: return f"{short} {m.group(1)}" return short def sem_year_only(raw: str | None) -> str: """Extrait juste l'année '23-24' depuis le label brut, '' si absent.""" if raw: m = _RE_YEAR.search(str(raw)) if m: return m.group(1) return "" def ann_short_label(sem_labels: list[str | None], idx: int) -> str: """Return 'Moy.23-24' using the year embedded in the label at *idx*.""" raw = sem_labels[idx] if idx < len(sem_labels) else None if raw: m = _RE_YEAR.search(str(raw)) if m: return f"Moy.{m.group(1)}" return f"Moy.S{idx + 1}"