#!/usr/bin/env python3
"""Convert every HTML e-mail of an mbox file into a PDF.

Usage: mbox2pdf.py <file.mbox>

A directory named after the slugified mbox file name is created in the
current working directory. Each ``text/html`` part found in a message is
rendered with WeasyPrint to ``<ISO date>.pdf`` or, when the message has no
usable ``Date`` header, to ``email_<index>.pdf``.
"""
# Provenance: recovered from a whitespace-collapsed patch -- commit 584a5e8
# ("Premier jet"), Antoine Van Elstraete, 2025-02-13, new file mbox2pdf.py.

import codecs
import html
import logging
import mailbox
import os
import sys
from datetime import datetime
from email.message import Message
from email.utils import parsedate_to_datetime
from typing import Optional

from bs4 import BeautifulSoup
from slugify import slugify
from weasyprint import HTML

# WeasyPrint and fontTools are very chatty; only let real errors through.
logging.getLogger('fontTools').setLevel(logging.ERROR)
logging.getLogger('weasyprint').setLevel(logging.ERROR)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

# Charset assumed when a part does not declare one (or declares an unknown one).
DEFAULT_CHARSET = 'windows-1252'

# Strict RFC 2822 layout historically accepted by this script.
_STRICT_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"


def _safe_charset(part: Message) -> str:
    """Return a charset for *part* that Python is guaranteed to know."""
    charset = part.get_content_charset(DEFAULT_CHARSET)
    try:
        codecs.lookup(charset)
    except LookupError:
        logging.warning(
            f"Jeu de caractères inconnu: {charset}, utilisation de {DEFAULT_CHARSET}"
        )
        return DEFAULT_CHARSET
    return charset


def _parse_date(date_header) -> Optional[datetime]:
    """Parse a ``Date`` header value; return ``None`` if absent or invalid."""
    if not date_header:
        return None
    # The header may be an email.header.Header instance rather than a str.
    text = str(date_header)
    try:
        # Tried first so that dates the script always accepted keep producing
        # exactly the same file names.
        return datetime.strptime(text, _STRICT_DATE_FORMAT)
    except ValueError:
        pass
    try:
        # Tolerates the other RFC 5322 variants: no weekday, trailing
        # "(CET)" comment, obsolete zone names, ...
        return parsedate_to_datetime(text)
    except (TypeError, ValueError):  # TypeError on Python < 3.10
        logging.warning(f"Format de date invalide: {date_header}")
        return None


def _build_header(from_header, to_header, date_simple) -> str:
    """Return the HTML banner prepended to the mail body.

    NOTE(review): the original markup of this banner was lost (only an empty
    multi-line string survived); it is reconstructed here from the three
    values the original code computed for it. Confirm layout and labels.
    """
    rows = (("De", from_header), ("À", to_header), ("Date", date_simple))
    lines = "<br>\n".join(
        # Header values come from the mail itself: escape them.
        f"<b>{label} :</b> {html.escape(str(value)) if value is not None else ''}"
        for label, value in rows
    )
    return f"<p>\n{lines}\n</p>\n<hr>\n"


def extract_and_convert_email(message: Message, output_file_base: str, i: int) -> None:
    """Render each ``text/html`` part of *message* to a PDF file.

    Args:
        message: The e-mail to convert.
        output_file_base: Existing directory that receives the PDF.
        i: Index of the message in the mailbox; used for logging and as the
            fallback file name when the date is missing or invalid.

    Errors are logged and never raised, so one bad message cannot abort the
    processing of a whole mailbox. A message with several HTML parts (or
    several messages sharing the same timestamp) write to the same file name,
    the last one winning.
    """
    logging.debug(f"Extraction du mail {i}")
    for part in message.walk():
        if part.get_content_type() != 'text/html':
            continue

        try:
            payload = part.get_payload(decode=True).decode(
                _safe_charset(part), errors='replace'
            )
            # Round-trip through BeautifulSoup to repair broken markup.
            html_content = str(BeautifulSoup(payload, 'html.parser'))
        except Exception as e:  # best effort: skip this part, keep going
            logging.error(f"Erreur de décodage: {e}")
            logging.debug(f"Contenu brut (non décodé): {part.get_payload(decode=False)}")
            continue

        # Date extraction: ISO form for the file name, simple form for display.
        date_object = _parse_date(message.get('Date'))
        if date_object is not None:
            # ":" is replaced by "-" because it is not allowed in file names
            # on every platform.
            file_stem = date_object.isoformat().replace(":", "-")
            date_simple = date_object.strftime("%d/%m/%Y, %H:%M:%S")
        else:
            file_stem = f"email_{i}"  # default name when there is no valid date
            date_simple = "date inconnue"
        output_file = os.path.join(output_file_base, file_stem)

        html_header = _build_header(message.get('From'), message.get('To'), date_simple)

        # PDF conversion
        try:
            HTML(string=html_header + html_content).write_pdf(output_file + ".pdf")
        except Exception as e:
            # Logged as an error (not debug) so that a failed conversion is
            # actually visible with the logging level configured above.
            logging.error(f"Erreur lors de la conversion PDF : {e}")
        else:
            logging.info(f"Email converti en PDF : {output_file}.pdf")


def process_mbox(mbox_file: str) -> None:
    """Convert every message of *mbox_file* into PDFs.

    The output directory is ``<cwd>/<slugified mbox base name>`` and is
    created if needed.

    Raises:
        mailbox.NoSuchMailboxError: If *mbox_file* does not exist.
    """
    # create=False: a mistyped path must fail instead of silently creating an
    # empty mailbox file.
    mbox = mailbox.mbox(mbox_file, create=False)
    try:
        logging.info(f"Traitement de {mbox_file}")

        # Output directory creation
        output_dir_name = slugify(os.path.splitext(os.path.basename(mbox_file))[0])
        output_dir = os.path.join(os.getcwd(), output_dir_name)
        os.makedirs(output_dir, exist_ok=True)
        logging.debug(f"Extraction dans le dossier {output_dir}")

        for i, message in enumerate(mbox):
            extract_and_convert_email(message, output_dir, i)
    finally:
        mbox.close()


def main() -> None:
    """Command-line entry point: ``mbox2pdf.py <file.mbox>``."""
    if len(sys.argv) <= 1:
        # sys.exit(str) prints to stderr and exits with status 1.
        sys.exit("Veuillez fournir le fichier mbox en argument.")
    mbox_file = sys.argv[1]
    try:
        process_mbox(mbox_file)
    except mailbox.NoSuchMailboxError:
        sys.exit(f"Fichier mbox introuvable : {mbox_file}")


if __name__ == "__main__":
    main()