377 lines
13 KiB
Python
377 lines
13 KiB
Python
"""PDF parser for EPTM Bulletins de Notes (BN).

Two PDF variants:
- DUAL : classes AUTOMAT / DUAL — 1 page per apprenti
         groups: Branches de culture générale (CG) + Branches professionnelles (BP)
- EM   : classes EM-* — 2 pages per apprenti (page 1 = notes, page 2 = observations)
         groups: Branches professionnelles (BP) + Travaux pratiques (TP)

Extracted rows (per group):
- Branches individuelles          → branches: [{"nom": str, "notes": [...]}]
- Moyenne semestrielle du groupe  → moy_sem[0..7]
- Moyenne annuelle du groupe      → moy_ann[0..7]  (non-null at Sem.2,4,6,8 positions)

Global rows:
- Moyenne semestrielle globale    → globale.moy_sem[0..7]
- Moyenne annuelle globale        → globale.moy_ann[0..7]
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pdfplumber
|
|
|
|
# Class identifier following the word "Classe": an uppercase letter, then
# uppercase letters / digits / hyphens, optionally followed by whitespace and
# a number.
_RE_CLASSE = re.compile(
    r"Classe\s+([A-Z][A-Z0-9\-]+(?:\s+\d+)?)"
)
# Two digits, a hyphen, two digits (e.g. the "23-24" part of a semester label).
_RE_YEAR = re.compile(
    r"(\d{2}-\d{2})"
)
# Number that follows "Sem." in a semester header cell.
_RE_SEM_N = re.compile(
    r"Sem\.\s*(\d+)"
)
|
|
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
def _to_float(cell) -> float | None:
    """Convert a raw table cell to a finite float.

    The cell is stringified, stripped, and a decimal comma is turned into a
    decimal point before conversion.

    Returns ``None`` when *cell* is ``None``, when its text is not a number,
    or when it parses to a non-finite value.
    """
    import math  # local import: keeps this helper self-contained

    if cell is None:
        return None
    text = str(cell).strip().replace(",", ".")
    try:
        value = float(text)
    except ValueError:
        return None
    # float() also accepts "nan", "inf" and "infinity"; none of those is a
    # usable grade, so treat them like any other non-numeric text.
    return value if math.isfinite(value) else None
|
|
|
|
|
|
def _extract_name(page) -> tuple[str, str]:
    """Return ``(nom, prenom)`` from the top-left address block of *page*.

    Words whose ``x0`` lies in the left 48 % of the page width are grouped
    into visual lines.  The first line that does not look like an address,
    contains none of the known header keywords, holds no digit and starts
    with an uppercase letter is taken as the name: its first word is ``nom``
    and the remainder (possibly empty) is ``prenom``.

    Returns ``("", "")`` when word extraction fails, when there are no words
    on the left side, or when no line qualifies.
    """
    try:
        words = page.extract_words()
    except Exception:
        # Deliberate best effort: a page whose words cannot be extracted
        # yields no name instead of aborting the parse.
        return "", ""

    if not words:
        return "", ""

    # Keep only words on the left half of the page.
    left_limit = page.width * 0.48
    left = [w for w in words if w["x0"] < left_limit]
    if not left:
        return "", ""

    # Coarse reading order: vertical position bucketed to multiples of 3,
    # then horizontal position.
    left.sort(key=lambda w: (round(w["top"] / 3) * 3, w["x0"]))

    # Group consecutive words into lines (less than 5pt apart vertically).
    lines: list[list[dict]] = [[left[0]]]
    for word in left[1:]:
        if abs(word["top"] - lines[-1][-1]["top"]) < 5:
            lines[-1].append(word)
        else:
            lines.append([word])

    address_starts = re.compile(
        r"^(Route|Rue|Chemin|Impasse|Avenue|Allée|Côte|Case|Clos|Place|Cours|\d{4})",
        re.I,
    )
    skip_kw = {"EPTM", "Professionnelle", "Technique", "Département",
               "Service", "Ecole", "École", "formation", "Canton",
               "Kanton", "page", "Sion", "Saint", "BULLETIN", "NOTES"}

    for line_words in lines:
        # Two words of the same visual line can land in different 3pt
        # buckets and therefore arrive out of horizontal order; restore
        # left-to-right order so nom and prenom are not swapped.
        ordered = sorted(line_words, key=lambda w: w["x0"])
        text = " ".join(w["text"] for w in ordered).strip()
        if not text:
            continue
        if address_starts.match(text):
            continue
        # NOTE(review): this is a substring test, so a name containing one
        # of these fragments (e.g. "page" in "Lepage") is skipped too —
        # confirm whether that is intended.
        if any(kw in text for kw in skip_kw):
            continue
        if any(ch.isdigit() for ch in text):
            continue
        parts = text.split(" ", 1)
        if parts[0] and parts[0][0].isupper():
            return parts[0], (parts[1] if len(parts) > 1 else "")

    return "", ""
|
|
|
|
|
|
def _find_bn_table_obj(page):
    """Locate the grades table on *page*.

    Returns a ``(table, rows)`` pair: the table object as yielded by
    ``page.find_tables()`` and the result of its ``extract()`` call.  Keeping
    the table object lets callers use per-cell bounding boxes to align
    sub-lines.  A table qualifies when it has at least 4 rows and its first
    row has at least 7 cells, one of which contains ``"Sem."``.

    Returns ``(None, None)`` when no table qualifies.
    """
    for candidate in page.find_tables():
        rows = candidate.extract()
        if not rows or len(rows) < 4:
            continue
        first_row = rows[0]
        if len(first_row) < 7:
            continue
        if any(cell and "Sem." in str(cell) for cell in first_row):
            return candidate, rows
    return None, None
|
|
|
|
|
|
def _cell_lines(page, bbox):
    """Return the visual text lines found inside a cell.

    *bbox* is passed to ``page.crop``; the words of the cropped region are
    ordered by ``(top, x0)`` and consecutive words less than 4 apart
    vertically are merged into one line.  Each returned item is
    ``{"top": <mean top of the line's words>, "text": <words joined by a
    space, left to right>}``.  The ``top`` value is what callers use to align
    branch labels with their grades.

    Returns ``[]`` when *bbox* is ``None``, when cropping or word extraction
    raises, or when the region holds no words.
    """
    if bbox is None:
        return []
    try:
        words = page.crop(bbox).extract_words()
    except Exception:
        return []
    if not words:
        return []

    words.sort(key=lambda w: (w["top"], w["x0"]))

    grouped: list[list[dict]] = []
    for word in words:
        if grouped and abs(word["top"] - grouped[-1][-1]["top"]) < 4:
            grouped[-1].append(word)
        else:
            grouped.append([word])

    result = []
    for group in grouped:
        ordered = sorted(group, key=lambda w: w["x0"])
        tops = [w["top"] for w in ordered]
        result.append({
            "top": sum(tops) / len(tops),
            "text": " ".join(w["text"] for w in ordered).strip(),
        })
    return result
|
|
|
|
|
|
def _find_bn_table(tables: list) -> list | None:
    """Return the first extracted table that looks like the BN grades table.

    A table qualifies when it has at least 4 rows and its first row has at
    least 7 cells, one of which contains ``"Sem."``.  Returns ``None`` when no
    table qualifies.

    Kept for backward compatibility — preferred path is _find_bn_table_obj.
    """

    def _is_bn(candidate) -> bool:
        # Empty / None tables and tables with fewer than 4 rows are rejected.
        if not candidate or len(candidate) < 4:
            return False
        first_row = candidate[0]
        if len(first_row) < 7:
            return False
        return any(cell and "Sem." in str(cell) for cell in first_row)

    return next((candidate for candidate in tables if _is_bn(candidate)), None)
|
|
|
|
|
|
def _is_notes_page(text: str) -> bool:
    """Tell whether *text* comes from a grades page.

    True when the text contains "BULLETIN DE NOTES" together with at least
    one of the group titles "Branches professionnelles" or
    "Branches de culture".
    """
    if "BULLETIN DE NOTES" not in text:
        return False
    group_titles = ("Branches professionnelles", "Branches de culture")
    return any(title in text for title in group_titles)
|
|
|
|
|
|
# ── Core page parser ──────────────────────────────────────────────────────────
|
|
|
|
def parse_bn_page(page) -> dict | None:
    """Parse one BN page into a per-apprentice record.

    Returns ``None`` when the page text is not a grades page (observations,
    cover…), when no grades table is found, or when no group header was
    recognised in the table.

    Otherwise returns a dict with the keys ``nom``, ``prenom``, ``classe``,
    ``type_classe`` ("EM" when the page text contains "Travaux pratiques",
    else "DUAL"), ``sem_labels`` (8 entries, ``None``-padded), ``groupes``
    (``"CG"`` / ``"BP"`` / ``"TP"`` → ``{"moy_sem", "moy_ann", "branches"}``)
    and ``globale`` (``{"moy_sem", "moy_ann"}``).
    """
    text = page.extract_text() or ""
    if not _is_notes_page(text):
        return None

    classe_match = _RE_CLASSE.search(text)
    classe = classe_match.group(1).strip() if classe_match else ""
    type_classe = "EM" if "Travaux pratiques" in text else "DUAL"

    nom, prenom = _extract_name(page)

    table_obj, bn_table = _find_bn_table_obj(page)
    if not bn_table:
        return None

    # Semester labels come from header columns 1-8, padded with None to 8.
    sem_labels: list[str | None] = [
        str(cell).strip() if cell else None for cell in bn_table[0][1:9]
    ]
    sem_labels.extend([None] * (8 - len(sem_labels)))

    # Bbox-aware rows, indexed like bn_table.
    table_rows = table_obj.rows

    # Group key → label fragments identifying its header row, in test order.
    group_markers = (
        ("CG", ("branches de culture", "culture g")),
        ("BP", ("branches professionnelles",)),
        ("TP", ("travaux pratiques",)),
    )

    groups: dict[str, dict] = {}
    globale: dict[str, list] = {"moy_sem": [None] * 8, "moy_ann": [None] * 8}
    current_group: str | None = None

    def _new_group() -> dict:
        return {"moy_sem": [None] * 8, "moy_ann": [None] * 8, "branches": []}

    def _note_near(top, column_lines):
        """Grade of the first line in the column within 4 of *top*, else None."""
        hit = next(
            (line for line in column_lines if abs(line["top"] - top) < 4),
            None,
        )
        return _to_float(hit["text"]) if hit is not None else None

    def _split_branches(table_row) -> list[dict]:
        """Split one table row into several branches using the vertical
        position of the words in each cell.

        Needed because the extracted table text does not keep empty
        sub-lines (e.g. 25 branches in the label cell but only 7 visible
        values in the Sem.1 column), so splitting cells on newlines would
        shift every grade.
        """
        if table_row is None:
            return []
        cells = table_row.cells
        if not cells or len(cells) < 2 or cells[0] is None:
            return []
        label_lines = _cell_lines(page, cells[0])
        if not label_lines:
            return []
        note_columns = [
            _cell_lines(page, cells[col] if col < len(cells) else None)
            for col in range(1, 9)
        ]
        return [
            {
                "nom": label["text"],
                "notes": [_note_near(label["top"], col) for col in note_columns],
            }
            for label in label_lines
        ]

    for idx, row in enumerate(bn_table[1:], start=1):
        if not row or not row[0]:
            continue

        low = str(row[0]).strip().lower()
        vals = [_to_float(cell) for cell in row[1:9]]
        vals.extend([None] * (8 - len(vals)))

        # Group headers carry a coefficient "(Nx)" (e.g. "Travaux pratiques
        # (1x)"); this distinguishes them from a branch that is itself named
        # "Travaux pratiques", which sometimes appears.
        header_key = None
        if re.search(r"\(\d+x\)", low):
            header_key = next(
                (
                    key
                    for key, fragments in group_markers
                    if any(fragment in low for fragment in fragments)
                ),
                None,
            )

        if header_key is not None:
            current_group = header_key
            groups.setdefault(header_key, _new_group())
        elif current_group is not None and "moyenne semestrielle du groupe" in low:
            groups[current_group]["moy_sem"] = vals
        elif current_group is not None and "moyenne annuelle du groupe" in low:
            groups[current_group]["moy_ann"] = vals
        elif "moyenne semestrielle globale" in low:
            globale["moy_sem"] = vals
        elif "moyenne annuelle globale" in low:
            globale["moy_ann"] = vals
            # Everything after this row (Absences, Observations, …) is ignored.
            break
        elif current_group is not None:
            # Any other row inside a group holds individual branches; align
            # labels and grades by vertical position.
            table_row = table_rows[idx] if idx < len(table_rows) else None
            groups[current_group]["branches"].extend(_split_branches(table_row))

    if not groups:
        return None

    return {
        "nom": nom,
        "prenom": prenom,
        "classe": classe,
        "type_classe": type_classe,
        "sem_labels": sem_labels,
        "groupes": groups,
        "globale": globale,
    }
|
|
|
|
|
|
# ── Public API ────────────────────────────────────────────────────────────────
|
|
|
|
def parse_bn_pdf(pdf_path: str | Path) -> dict:
    """Parse a BN PDF (one or two pages per apprenti).

    Every page is handed to ``parse_bn_page``; pages for which it returns a
    falsy value are skipped.  The top-level ``classe``, ``type_classe`` and
    ``sem_labels`` are copied from the first parsed apprenti.

    Returns::

        {
            "classe": str,
            "type_classe": "EM" | "DUAL",
            "sem_labels": [str|None, ...],
            "apprentis": [<record returned by parse_bn_page>, ...]
        }

    When no page yields a record, ``classe`` and ``type_classe`` are ``""``
    and ``sem_labels`` and ``apprentis`` are empty lists.
    """
    with pdfplumber.open(pdf_path) as pdf:
        apprentis: list[dict] = [
            record for record in map(parse_bn_page, pdf.pages) if record
        ]

    if apprentis:
        first = apprentis[0]
        classe = first["classe"]
        type_classe = first["type_classe"]
        sem_labels = first["sem_labels"]
    else:
        classe, type_classe, sem_labels = "", "", []

    return {
        "classe": classe,
        "type_classe": type_classe,
        "sem_labels": sem_labels,
        "apprentis": apprentis,
    }
|
|
|
|
|
|
# ── Label helpers (used by app.py) ────────────────────────────────────────────
|
|
|
|
def sem_short_label(raw: str | None, idx: int) -> str:
    """'Sem. 1\\n23-24 1' → 'S1'.

    Falls back to 'S{idx+1}' when *raw* is empty or holds no "Sem. N" part.
    """
    found = _RE_SEM_N.search(str(raw)) if raw else None
    number = found.group(1) if found else idx + 1
    return f"S{number}"
|
|
|
|
|
|
def sem_full_label(raw: str | None, idx: int) -> str:
    """'Sem. 1\\n23-24 1' → 'S1 23-24'.

    Returns only the short label (see ``sem_short_label``) when *raw* holds
    no year.
    """
    short = sem_short_label(raw, idx)
    year = sem_year_only(raw)
    return f"{short} {year}" if year else short
|
|
|
|
|
|
def sem_year_only(raw: str | None) -> str:
    """Extract just the year ('23-24') from the raw label; '' when absent."""
    if not raw:
        return ""
    found = _RE_YEAR.search(str(raw))
    return found.group(1) if found else ""
|
|
|
|
|
|
def ann_short_label(sem_labels: list[str | None], idx: int) -> str:
    """Return 'Moy.23-24' using the year embedded in the label at *idx*.

    Falls back to 'Moy.S{idx+1}' when *idx* is past the end of *sem_labels*
    or the label holds no year.
    """
    raw = sem_labels[idx] if idx < len(sem_labels) else None
    year = sem_year_only(raw)
    return f"Moy.{year}" if year else f"Moy.S{idx + 1}"
|