#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script      : download_zonas_pdf.py
Chemin      : /var/www/html/analyses/download_zonas_pdf.py
Description : Télécharge le PDF INTECMAR, tente d'extraire DATA/HORA depuis le PDF,
              puis le stocke dans un dossier au format : yyyymmdd-hhmmss.pdf
              (écrasement si déjà existant).
Usage       : ./venv/bin/python3 download_zonas_pdf.py
              ./venv/bin/python3 download_zonas_pdf.py --url https://.../Zonas_1401.pdf --output-dir /var/www/html/analyses/pdfs
Auteur      : Script autonome pour cron
Date        : 31/03/2026
Version     : 1.0
"""

from __future__ import annotations

import argparse
import io
import re
import sys
from datetime import datetime
from email.utils import parsedate_to_datetime
from pathlib import Path

import requests


# Default source document: the INTECMAR "Zonas_1401" PDF.
DEFAULT_URL = "https://www.intecmar.gal/PDFs/Zonas_1401.pdf"
# Directory containing this script (path made absolute, symlinks resolved).
BASE_DIR = Path(__file__).resolve().parent
# Default storage folder: a "pdfs" directory next to the script.
DEFAULT_OUTPUT_DIR = BASE_DIR / "pdfs"


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        A namespace with ``url`` (str), ``output_dir`` (str) and
        ``timeout`` (int, seconds). Every option has a default, so the
        script can run without any argument.
    """
    cli = argparse.ArgumentParser(description="Téléchargement périodique du PDF Zonas_1401")

    # One entry per option: (flag, keyword arguments for add_argument).
    option_table = (
        ("--url", {"default": DEFAULT_URL, "help": "URL du PDF à télécharger"}),
        ("--output-dir", {"default": str(DEFAULT_OUTPUT_DIR), "help": "Dossier de stockage"}),
        ("--timeout", {"type": int, "default": 45, "help": "Timeout HTTP en secondes"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def download_pdf(url: str, timeout: int) -> tuple[bytes, dict]:
    """Fetch *url* and return the PDF payload together with the response headers.

    Args:
        url: Address of the document to download.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(body, headers)`` tuple where ``headers`` is a plain ``dict``
        copy of the response headers.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If neither the Content-Type header nor the leading
            bytes of the body indicate a PDF.
    """
    reply = requests.get(url, timeout=timeout)
    reply.raise_for_status()

    body = reply.content
    declared_type = (reply.headers.get("Content-Type") or "").lower()

    # Accept the payload when either the header or the magic bytes say "PDF".
    looks_like_pdf = "pdf" in declared_type or body.startswith(b"%PDF")
    if not looks_like_pdf:
        raise RuntimeError(f"Le contenu téléchargé ne semble pas être un PDF (Content-Type={declared_type!r})")

    return body, dict(reply.headers)


def extract_text_with_pdfplumber(pdf_bytes: bytes) -> str:
    """Return the text of every non-blank page, joined with newlines.

    Extraction is best-effort: an empty string is returned when
    ``pdfplumber`` cannot be imported or when reading the document fails
    for any reason.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        The concatenated page texts, or ``""`` if nothing could be extracted.
    """
    # pdfplumber is optional; without it there is simply no text to return.
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return ""

    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            raw_pages = (page.extract_text() or "" for page in document.pages)
            # Keep only pages that contain something other than whitespace.
            kept_pages = [content for content in raw_pages if content.strip()]
    except Exception:
        return ""

    return "\n".join(kept_pages)


def extract_data_hora_from_text(text: str) -> tuple[str | None, str | None]:
    if not text:
        return None, None

    compact = re.sub(r"\s+", " ", text)

    date_match = re.search(r"\bDATA\s*:\s*([0-9]{2}/[0-9]{2}/[0-9]{4})\b", compact, flags=re.IGNORECASE)
    hora_match = re.search(r"\bHORA\s*:\s*([0-9]{2}:[0-9]{2}(?::[0-9]{2})?)\b", compact, flags=re.IGNORECASE)

    date_str = date_match.group(1) if date_match else None
    time_str = hora_match.group(1) if hora_match else None
    return date_str, time_str


def extract_from_last_modified(headers: dict) -> tuple[str | None, str | None]:
    raw = headers.get("Last-Modified") or headers.get("last-modified")
    if not raw:
        return None, None

    try:
        dt = parsedate_to_datetime(raw)
        if dt is None:
            return None, None
        if dt.tzinfo is not None:
            dt = dt.astimezone()
        return dt.strftime("%d/%m/%Y"), dt.strftime("%H:%M:%S")
    except Exception:
        return None, None


def build_filename(date_str: str, time_str: str) -> str:
    """Build the ``yyyymmdd-hhmmss.pdf`` file name for a date/time pair.

    Args:
        date_str: Date formatted as ``dd/mm/yyyy``.
        time_str: Time formatted as ``hh:mm`` or ``hh:mm:ss``; single-digit
            components are zero-padded and seconds default to ``00``.

    Returns:
        The file name, e.g. ``20260331-143000.pdf``.

    Raises:
        ValueError: If the date does not parse, or if the time does not have
            two or three components that are each 1-2 ASCII digits within
            range (hours 0-23, minutes and seconds 0-59).
    """
    date_obj = datetime.strptime(date_str, "%d/%m/%Y")

    parts = [part.strip() for part in time_str.split(":")]
    if len(parts) == 2:
        parts.append("00")  # seconds are optional
    if len(parts) != 3:
        raise ValueError(f"Heure invalide: {time_str}")

    # Validate every component: the result becomes part of a path, so
    # anything that is not a small in-range number must be rejected.
    upper_bounds = (23, 59, 59)
    padded = []
    for part, upper in zip(parts, upper_bounds):
        is_short_number = 1 <= len(part) <= 2 and part.isascii() and part.isdigit()
        if not is_short_number or int(part) > upper:
            raise ValueError(f"Heure invalide: {time_str}")
        padded.append(part.zfill(2))

    time_stamp = "".join(padded)
    return f"{date_obj.strftime('%Y%m%d')}-{time_stamp}.pdf"


def main() -> int:
    """Download the PDF and store it under a timestamp-derived file name.

    Timestamp sources are tried in this order: the DATA/HORA text inside
    the PDF, the HTTP ``Last-Modified`` header, then the local clock. A
    source is skipped when it yields no value, and also when its value
    cannot be turned into a file name (e.g. an impossible date), so that a
    malformed timestamp does not prevent the document from being saved.

    Returns:
        Process exit status: ``0`` on success, ``1`` when the download
        fails, ``2`` when no file name could be built, ``3`` when the output
        directory cannot be prepared or the file cannot be written.
    """
    args = parse_args()

    try:
        output_dir = Path(args.output_dir).expanduser().resolve()
        output_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, RuntimeError) as exc:
        print(f"[ERREUR] Création du dossier impossible: {exc}", file=sys.stderr)
        return 3

    try:
        pdf_bytes, headers = download_pdf(args.url, args.timeout)
    except Exception as exc:
        print(f"[ERREUR] Téléchargement impossible: {exc}", file=sys.stderr)
        return 1

    def stamp_from_pdf() -> tuple[str | None, str | None]:
        return extract_data_hora_from_text(extract_text_with_pdfplumber(pdf_bytes))

    def stamp_from_header() -> tuple[str | None, str | None]:
        return extract_from_last_modified(headers)

    def stamp_from_clock() -> tuple[str | None, str | None]:
        now = datetime.now()
        return now.strftime("%d/%m/%Y"), now.strftime("%H:%M:%S")

    # Ordered from most to least authoritative; evaluated lazily so later
    # sources are only consulted when earlier ones are unusable.
    candidates = (
        ("pdf_data_hora", stamp_from_pdf),
        ("http_last_modified", stamp_from_header),
        ("local_now", stamp_from_clock),
    )

    filename = None
    last_error = None
    source = date_str = time_str = None
    for source, provider in candidates:
        date_str, time_str = provider()
        if not (date_str and time_str):
            continue
        try:
            filename = build_filename(date_str, time_str)
        except Exception as exc:
            last_error = exc
            print(
                f"[AVERTISSEMENT] Horodatage inutilisable ({source}: {date_str} {time_str}): {exc}",
                file=sys.stderr,
            )
            continue
        break

    if filename is None:
        print(f"[ERREUR] Impossible de construire le nom de fichier: {last_error}", file=sys.stderr)
        return 2

    destination = output_dir / filename

    try:
        destination.write_bytes(pdf_bytes)
    except Exception as exc:
        print(f"[ERREUR] Écriture impossible: {exc}", file=sys.stderr)
        return 3

    print(f"[OK] PDF enregistré: {destination}")
    print(f"[INFO] Source horodatage: {source}")
    print(f"[INFO] DATA={date_str} HORA={time_str}")
    return 0


# When run as a script, execute main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())