2025-02-13 16:35:01 +01:00
|
|
|
#!/bin/env python
|
|
|
|
|
|
|
|
import mailbox
|
|
|
|
import codecs
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from weasyprint import HTML
|
|
|
|
from datetime import datetime
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import logging
|
|
|
|
from slugify import slugify
|
|
|
|
|
|
|
|
# Restrict the fontTools and weasyprint loggers to ERROR and above.
for _library_logger in ('fontTools', 'weasyprint'):
    logging.getLogger(_library_logger).setLevel(logging.ERROR)

# Root logger: "LEVEL:message" lines, ERROR and above only.
logging.basicConfig(level=logging.ERROR, format='%(levelname)s:%(message)s')
|
|
|
|
|
|
|
|
def _sanitize_html(payload):
    """Parse *payload* as HTML and drop every tag attribute except ``style``."""
    # One-element tuple: a bare ("style") is just a string, which would turn
    # the membership test below into a substring test.
    attr_whitelist = ("style",)
    soup = BeautifulSoup(payload, 'html.parser')
    for tag in soup.find_all(True):
        # Snapshot the names first: tag.attrs is mutated while deleting.
        for attr in [attr for attr in tag.attrs if attr not in attr_whitelist]:
            del tag[attr]
    return str(soup)


def _resolve_output_name(date_header, output_file_base, i):
    """Return ``(output_file, date_simple)`` for a message.

    ``output_file`` is the extension-less target path and ``date_simple``
    the HTML snippet shown in the header box. When the ``Date`` header is
    missing or cannot be parsed, the file is named ``email_<i>`` and the
    date is reported as unknown.
    """
    from email.utils import parsedate_to_datetime

    fallback = (
        os.path.join(output_file_base, f"email_{i}"),
        "<i>date inconnue</i>",
    )
    if not date_header:
        return fallback
    try:
        # Accepts the usual RFC 5322 variants (missing weekday, trailing
        # "(CET)" comment, ...) that a fixed strptime format rejects.
        date_object = parsedate_to_datetime(str(date_header))
        date_simple = date_object.strftime("%d/%m/%Y, %H:%M:%S")
    except (TypeError, ValueError):
        logging.warning(f"Format de date invalide: {date_header}")
        return fallback
    # ":" is replaced by "-" so the ISO timestamp is usable as a file name.
    date_iso = date_object.isoformat().replace(":", "-")
    return os.path.join(output_file_base, f"{date_iso}"), date_simple


def _build_header_html(from_header, to_header, date_simple):
    """Build the boxed From / To / Date summary placed above the mail body."""
    from html import escape

    # Header values are escaped: an address such as "Name <a@b.c>" would
    # otherwise be parsed as a tag and vanish from the rendered PDF.
    # date_simple is produced internally and may contain markup on purpose.
    return (
        "<div style=\"border: thick double black;\"><ul>"
        f"<li>De : {escape(str(from_header))}</li>"
        f"<li>À : {escape(str(to_header))}</li>"
        f"<li>Date : {date_simple}</li>"
        "</ul><br /></div>"
    )


def extract_and_convert_email(message, output_file_base, i):
    """Write each ``text/html`` part of *message* to a PDF file.

    The HTML is stripped of every attribute except ``style``, prefixed with
    a From / To / Date summary box and rendered to PDF.

    Args:
        message: The e-mail message whose parts are walked.
        output_file_base: Directory in which the PDF is written.
        i: Index of the message, used in log output and as the fallback
            file name (``email_<i>.pdf``) when no usable date is available.

    Returns:
        None. Decoding and conversion failures are logged, not raised.

    NOTE: the output name depends only on the Date header (or on ``i``), so
    a second HTML part of the same message, or another message carrying the
    same Date, overwrites the earlier PDF.
    """
    logging.debug(f"Extraction du mail {i}")
    for part in message.walk():
        if part.get_content_type() != 'text/html':
            continue

        # Parts that do not declare a charset are decoded as windows-1252.
        charset = part.get_content_charset('windows-1252')
        try:
            payload = part.get_payload(decode=True).decode(charset, errors='replace')
            html_content = _sanitize_html(payload)
        except Exception as e:
            logging.error(f"Erreur de décodage: {e}")
            logging.debug(f"Contenu brut (non décodé): {part.get_payload(decode=False)}")
            continue

        # Date extraction and ISO formatting for the file name.
        output_file, date_simple = _resolve_output_name(
            message.get('Date'), output_file_base, i
        )
        html_header = _build_header_html(
            message.get('From'), message.get('To'), date_simple
        )

        # PDF conversion.
        try:
            HTML(string=html_header + html_content).write_pdf(output_file + ".pdf")
            logging.info(f"Email converti en PDF : {output_file}.pdf")
        except Exception as e:
            # Logged as an error so a failed conversion is visible at the
            # ERROR level this script configures.
            logging.error(f"Erreur lors de la conversion PDF : {e}")
|
|
|
|
|
|
|
|
|
|
|
|
def process_mbox(mbox_file):
    """Convert every message of the mbox file *mbox_file* to PDF.

    The PDFs are written to a directory created in the current working
    directory and named after the slugified base name of the mbox file
    (extension removed).

    Args:
        mbox_file: Path of the mbox file to read.

    Returns:
        None. A missing file is reported through ``logging.error`` and
        nothing is created.
    """
    # mailbox.mbox() creates a missing file by default, which would leave an
    # empty mbox and an empty output directory behind for a mistyped path.
    if not os.path.isfile(mbox_file):
        logging.error(f"Fichier mbox introuvable : {mbox_file}")
        return

    mbox = mailbox.mbox(mbox_file, create=False)
    try:
        logging.info(f"Traitement de {mbox_file}")

        # Create the output directory.
        output_dir_name = slugify(os.path.splitext(os.path.basename(mbox_file))[0])
        output_dir = os.path.join(os.getcwd(), output_dir_name)
        os.makedirs(output_dir, exist_ok=True)
        logging.debug(f"Extraction dans le dossier {output_dir}")

        for i, message in enumerate(mbox):
            extract_and_convert_email(message, output_dir, i)
    finally:
        # Release the underlying file handle even if a message fails.
        mbox.close()
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Exactly one positional argument is used: the path of the mbox file.
    if len(sys.argv) > 1:
        process_mbox(sys.argv[1])
    else:
        print("Veuillez fournir le fichier mbox en argument.")
        # Signal the usage error to the calling shell instead of exiting 0.
        sys.exit(1)
|