mbox2pdf/mbox2pdf.py

84 lines
3.6 KiB
Python
Raw Normal View History

2025-02-13 16:35:01 +01:00
#!/bin/env python
import codecs
import html
import logging
import mailbox
import os
import sys
from datetime import datetime
from email.errors import MessageError
from email.header import decode_header, make_header
from email.utils import parsedate_to_datetime

from bs4 import BeautifulSoup
from slugify import slugify
from weasyprint import HTML
# Raise the threshold of the fontTools and weasyprint loggers to ERROR so
# that only their error records get through.
for _library_logger in ('fontTools', 'weasyprint'):
    logging.getLogger(_library_logger).setLevel(logging.ERROR)

# Root logger: "LEVEL:message" lines, errors only.
logging.basicConfig(level=logging.ERROR, format='%(levelname)s:%(message)s')
def _header_as_html(raw_value):
    """Return a mail header value as display text, escaped for use in HTML.

    A missing header yields an empty string. RFC 2047 encoded-words
    ("=?utf-8?q?...?=") are decoded when possible; if decoding fails the raw
    value is used instead.
    """
    if raw_value is None:
        return ""
    try:
        text = str(make_header(decode_header(raw_value)))
    except (LookupError, UnicodeError, MessageError):
        # Unknown charset or malformed encoded-word: show the header as-is.
        text = str(raw_value)
    # Addresses look like "Name <user@host>"; without escaping, the
    # "<user@host>" part would be parsed as an HTML tag and vanish.
    return html.escape(text)


def _sanitize_html(payload):
    """Parse *payload* and drop every tag attribute except ``style``."""
    # Must be a real tuple: ("style") is just the string "style", which turns
    # the membership test below into a substring test.
    attr_whitelist = ("style",)
    soup = BeautifulSoup(payload, 'html.parser')
    for tag in soup.find_all(True):
        # Snapshot the attribute names first: tag.attrs is mutated by `del`.
        for attr in [name for name in tag.attrs if name not in attr_whitelist]:
            del tag[attr]
    return str(soup)


def _output_path_and_date(date_header, output_file_base, i):
    """Return ``(output path without extension, display date)`` for a mail.

    When the Date header is missing or cannot be parsed, the path falls back
    to ``email_<i>`` and the display date to an "unknown date" placeholder.
    """
    if date_header is not None:
        try:
            date_object = parsedate_to_datetime(str(date_header))
        except (TypeError, ValueError):
            # TypeError: unparsable date on older Python versions;
            # ValueError: unparsable or out-of-range date.
            logging.warning(f"Format de date invalide: {date_header}")
        else:
            # ":" is replaced by "-" to keep the file name portable.
            date_iso = date_object.isoformat().replace(":", "-")
            date_simple = date_object.strftime("%d/%m/%Y, %H:%M:%S")
            return os.path.join(output_file_base, date_iso), date_simple
    return os.path.join(output_file_base, f"email_{i}"), "<i>date inconnue</i>"


def extract_and_convert_email(message, output_file_base, i):
    """Render every ``text/html`` part of *message* to a PDF file.

    Each HTML part is decoded, stripped of all tag attributes except
    ``style``, prefixed with a small From/To/Date banner and written to
    ``<output_file_base>/<ISO date>.pdf`` (``email_<i>.pdf`` when the Date
    header is missing or invalid).

    Args:
        message: Message object exposing ``walk()`` and ``get()``.
        output_file_base: Directory in which the PDF is written.
        i: Index of the message, used for logging and the fallback file name.

    Returns:
        None. Failures are logged and never raised, so that one broken mail
        does not abort the processing of the remaining ones.
    """
    logging.debug(f"Extraction du mail {i}")
    for part in message.walk():
        if part.get_content_type() != 'text/html':
            continue

        try:
            # Mails without a declared charset are assumed to be windows-1252.
            charset = part.get_content_charset('windows-1252')
            payload = part.get_payload(decode=True).decode(charset, errors='replace')

            # Clean up the HTML.
            html_content = _sanitize_html(payload)

            # Derive the file name and the human-readable date.
            # NOTE(review): several HTML parts in one message, or two
            # messages with the same date, map to the same file name and
            # overwrite each other - confirm whether that is acceptable.
            output_file, date_simple = _output_path_and_date(
                message.get('Date'), output_file_base, i
            )

            html_header = (
                "<div style=\"border: thick double black;\"><ul>"
                f"<li>De : {_header_as_html(message.get('From'))}</li>"
                f"<li>&#192; : {_header_as_html(message.get('To'))}</li>"
                f"<li>Date : {date_simple}</li>"
                "</ul><br /></div>"
            )
        except Exception as e:
            # Deliberately broad: best-effort processing of arbitrary mail.
            logging.error(f"Erreur de décodage: {e}")
            logging.debug(f"Contenu brut (non décodé): {part.get_payload(decode=False)}")
            continue

        # PDF conversion.
        try:
            HTML(string=html_header + html_content).write_pdf(output_file + ".pdf")
            logging.info(f"Email converti en PDF : {output_file}.pdf")
        except Exception as e:
            # Logged as an error (not debug) so a failed conversion is visible.
            logging.error(f"Erreur lors de la conversion PDF : {e}")
def process_mbox(mbox_file):
    """Convert every message of the mbox file *mbox_file* to PDF.

    The PDFs are written to a directory created in the current working
    directory and named after the slugified base name of *mbox_file*
    (extension removed).

    Args:
        mbox_file: Path of the mbox file to read.

    Returns:
        None. A missing mbox file is reported through the error log.
    """
    try:
        # create=False: by default mailbox.mbox would silently create an
        # empty mailbox file when the path does not exist (e.g. a typo).
        mbox = mailbox.mbox(mbox_file, create=False)
    except mailbox.NoSuchMailboxError:
        logging.error(f"Fichier mbox introuvable : {mbox_file}")
        return

    try:
        logging.info(f"Traitement de {mbox_file}")

        # Create the output directory.
        output_dir_name = slugify(os.path.splitext(os.path.basename(mbox_file))[0])
        output_dir = os.path.join(os.getcwd(), output_dir_name)
        os.makedirs(output_dir, exist_ok=True)
        logging.debug(f"Extraction dans le dossier {output_dir}")

        for i, message in enumerate(mbox):
            extract_and_convert_email(message, output_dir, i)
    finally:
        # Release the underlying file handle even if processing fails.
        mbox.close()
if __name__ == "__main__":
    # Script entry point: expects the path of the mbox file as first argument.
    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits with
        # status 1, so callers can detect the usage error.
        sys.exit("Veuillez fournir le fichier mbox en argument.")
    process_mbox(sys.argv[1])