"""Parser for EPTM Matu MP grade lists ('Liste de contrôle des notes MP').""" from __future__ import annotations import re from pathlib import Path import pdfplumber _RE_HDR = re.compile( r"contr.le des notes\s+([\w][\w\-]*\s+[\w][\w\-]*)\s+(\d{2}-\d{2}\s+\d+)", re.I, ) def _to_float(v) -> float | None: if v is None: return None s = str(v).strip().replace(",", ".") try: return float(s) except ValueError: return None def parse_matu_pdf(pdf_path: str | Path) -> dict: """Parse an MP grade control list PDF. Returns:: { "classe_mp": "MP1-TASV 2A", "sem_label": "25-26 2", "apprentis": [ { "nom_complet": str, "moy": float | None, "promotion": str | None, # "B" / "P" / "NB" "prom_info": str | None, # "25-26 1" for NB }, ... ], } """ classe_mp = "" sem_label = "" apprentis: list[dict] = [] with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: text = page.extract_text() or "" if not classe_mp: m = _RE_HDR.search(text) if m: raw = m.group(1).strip() # Normalize "3-C" → "3C" (some PDFs use a hyphen before the section letter) classe_mp = re.sub(r"(\d)-([A-Za-z])\s*$", r"\1\2", raw) sem_label = re.sub(r"\s+", " ", m.group(2).strip()) for tbl in page.extract_tables(): if not tbl or len(tbl) < 4: continue # Verify it looks like the grades table (col 3 contains numeric values in data rows) found = False for row in tbl[2:5]: if row and row[0] and _to_float(row[3] if len(row) > 3 else None) is not None: found = True break if not found: continue for row in tbl[2:]: # skip 2 header rows if not row or not row[0]: continue name = str(row[0]).strip() if not name or "Moyenne" in name: continue moy = _to_float(row[3] if len(row) > 3 else None) prom_raw = str(row[6]).strip() if len(row) > 6 and row[6] else "" prom_info_raw = str(row[7]).strip() if len(row) > 7 and row[7] else "" apprentis.append({ "nom_complet": name, "moy": moy, "promotion": prom_raw or None, "prom_info": prom_info_raw or None, }) # Fallback : extract class name from filename if PDF text parsing failed if not classe_mp: stem = Path(pdf_path).stem # e.g. "matu_MP1-TASV_3C" if stem.startswith("matu_"): classe_mp = stem[5:].replace("_", " ") # "MP1-TASV 3C" return {"classe_mp": classe_mp, "sem_label": sem_label, "apprentis": apprentis}