eptm_dashboard/src/parser_matu.py

94 lines
3.1 KiB
Python

"""Parser for EPTM Matu MP grade lists ('Liste de contrôle des notes MP')."""
from __future__ import annotations

import math
import re
from pathlib import Path

import pdfplumber
_RE_HDR = re.compile(
r"contr.le des notes\s+([\w][\w\-]*\s+[\w][\w\-]*)\s+(\d{2}-\d{2}\s+\d+)",
re.I,
)
def _to_float(v) -> float | None:
if v is None:
return None
s = str(v).strip().replace(",", ".")
try:
return float(s)
except ValueError:
return None
def _cell_at(row, index):
    """Return ``row[index]``, or ``None`` when the row is too short."""
    return row[index] if len(row) > index else None


def _cell_text(row, index) -> str | None:
    """Return the stripped text of ``row[index]``, or ``None`` if missing or empty."""
    value = _cell_at(row, index)
    if not value:
        return None
    return str(value).strip() or None


def _looks_like_grades_table(tbl) -> bool:
    """Tell whether one of the first data rows has a name and a numeric column 3."""
    return any(
        row and row[0] and _to_float(_cell_at(row, 3)) is not None
        for row in tbl[2:5]
    )


def _parse_apprenti_row(row) -> dict | None:
    """Build one apprentice record from a table row, or ``None`` to skip the row."""
    if not row or not row[0]:
        return None
    name = str(row[0]).strip()
    # Rows whose first cell contains "Moyenne" are not apprentices.
    if not name or "Moyenne" in name:
        return None
    return {
        "nom_complet": name,
        "moy": _to_float(_cell_at(row, 3)),
        "promotion": _cell_text(row, 6),
        "prom_info": _cell_text(row, 7),
    }


def parse_matu_pdf(pdf_path: str | Path) -> dict:
    """Parse an MP grade control list PDF.

    The class name and semester label are taken from the first page whose
    text matches ``_RE_HDR``. If no page matches, the class name falls back
    to the file name when its stem starts with ``"matu_"`` (underscores
    become spaces); the semester label then stays empty.

    Every table with at least four rows that looks like a grades table
    contributes one record per data row. The first two rows of each table
    are treated as headers and skipped.

    Returns::

        {
            "classe_mp": "MP1-TASV 2A",
            "sem_label": "25-26 2",
            "apprentis": [
                {
                    "nom_complet": str,
                    "moy": float | None,
                    "promotion": str | None,   # "B" / "P" / "NB"
                    "prom_info": str | None,   # "25-26 1" for NB
                },
                ...
            ],
        }
    """
    classe_mp = ""
    sem_label = ""
    apprentis: list[dict] = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Text is only needed until the header has been found, so skip
            # the text extraction on all later pages.
            if not classe_mp:
                header = _RE_HDR.search(page.extract_text() or "")
                if header:
                    raw = header.group(1).strip()
                    # Normalize "3-C" -> "3C" (some PDFs use a hyphen before
                    # the section letter).
                    classe_mp = re.sub(r"(\d)-([A-Za-z])\s*$", r"\1\2", raw)
                    sem_label = re.sub(r"\s+", " ", header.group(2).strip())

            for tbl in page.extract_tables():
                if not tbl or len(tbl) < 4:
                    continue
                if not _looks_like_grades_table(tbl):
                    continue
                for row in tbl[2:]:  # skip 2 header rows
                    apprenti = _parse_apprenti_row(row)
                    if apprenti is not None:
                        apprentis.append(apprenti)

    # Fallback: extract the class name from the file name if the PDF text
    # did not yield one.
    if not classe_mp:
        stem = Path(pdf_path).stem  # e.g. "matu_MP1-TASV_3C"
        if stem.startswith("matu_"):
            classe_mp = stem[len("matu_"):].replace("_", " ")  # "MP1-TASV 3C"

    return {"classe_mp": classe_mp, "sem_label": sem_label, "apprentis": apprentis}