94 lines
3.1 KiB
Python
94 lines
3.1 KiB
Python
"""Parser for EPTM Matu MP grade lists ('Liste de contrôle des notes MP')."""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pdfplumber
|
|
|
|
_RE_HDR = re.compile(
|
|
r"contr.le des notes\s+([\w][\w\-]*\s+[\w][\w\-]*)\s+(\d{2}-\d{2}\s+\d+)",
|
|
re.I,
|
|
)
|
|
|
|
|
|
def _to_float(v) -> float | None:
|
|
if v is None:
|
|
return None
|
|
s = str(v).strip().replace(",", ".")
|
|
try:
|
|
return float(s)
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _cell_text(row, idx: int) -> str:
    """Return cell ``idx`` of ``row`` as single-spaced text, or "" if absent/empty."""
    if idx >= len(row) or not row[idx]:
        return ""
    # A cell whose text wraps can come back with embedded newlines;
    # split()/join trims both ends and collapses every whitespace run to one
    # space, the same normalisation already applied to the semester label.
    return " ".join(str(row[idx]).split())


def _is_grades_table(tbl) -> bool:
    """Tell whether one of rows 2-4 of ``tbl`` has a name and a numeric column 3."""
    return any(
        row and row[0] and len(row) > 3 and _to_float(row[3]) is not None
        for row in tbl[2:5]
    )


def parse_matu_pdf(pdf_path: str | Path) -> dict:
    """Parse an MP grade control list PDF.

    The class and semester labels are taken from the first page whose text
    matches the header pattern. Every table that looks like a grades table
    contributes one entry per named row; rows whose name contains "Moyenne"
    are skipped. If no header is found and the file stem starts with
    ``matu_``, the class label is derived from the file name instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns::

        {
            "classe_mp": "MP1-TASV 2A",
            "sem_label": "25-26 2",
            "apprentis": [
                {
                    "nom_complet": str,
                    "moy": float | None,
                    "promotion": str | None,   # "B" / "P" / "NB"
                    "prom_info": str | None,   # "25-26 1" for NB
                },
                ...
            ],
        }

        ``classe_mp`` and ``sem_label`` are empty strings when they cannot be
        determined. Text cells are returned with whitespace runs (including
        line breaks inside a cell) collapsed to a single space.
    """
    classe_mp = ""
    sem_label = ""
    apprentis: list[dict] = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # The page text is only needed until the header has been found.
            if not classe_mp:
                m = _RE_HDR.search(page.extract_text() or "")
                if m:
                    raw = m.group(1).strip()
                    # Normalize "3-C" -> "3C" (some PDFs use a hyphen before
                    # the section letter).
                    classe_mp = re.sub(r"(\d)-([A-Za-z])\s*$", r"\1\2", raw)
                    sem_label = re.sub(r"\s+", " ", m.group(2).strip())

            for tbl in page.extract_tables():
                # Ignore tables with fewer than four rows and tables without
                # a numeric column 3 in their first data rows.
                if not tbl or len(tbl) < 4 or not _is_grades_table(tbl):
                    continue

                for row in tbl[2:]:  # skip 2 header rows
                    if not row:
                        continue
                    name = _cell_text(row, 0)
                    if not name or "Moyenne" in name:
                        continue
                    apprentis.append({
                        "nom_complet": name,
                        "moy": _to_float(row[3]) if len(row) > 3 else None,
                        "promotion": _cell_text(row, 6) or None,
                        "prom_info": _cell_text(row, 7) or None,
                    })

    # Fallback: extract the class name from the file name if PDF text parsing
    # did not yield one.
    if not classe_mp:
        stem = Path(pdf_path).stem  # e.g. "matu_MP1-TASV_3C"
        if stem.startswith("matu_"):
            classe_mp = stem[5:].replace("_", " ")  # "MP1-TASV 3C"

    return {"classe_mp": classe_mp, "sem_label": sem_label, "apprentis": apprentis}
|