Premier jet
This commit is contained in:
parent
fd4030aa8d
commit
584a5e82b1
78
mbox2pdf.py
Executable file
78
mbox2pdf.py
Executable file
@ -0,0 +1,78 @@
|
||||
#!/bin/env python
|
||||
|
||||
import mailbox
|
||||
import codecs
|
||||
from bs4 import BeautifulSoup
|
||||
from weasyprint import HTML
|
||||
from datetime import datetime
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from slugify import slugify
|
||||
|
||||
# Root logger: only ERROR and above are emitted, as "LEVEL:message".
logging.basicConfig(level=logging.ERROR, format='%(levelname)s:%(message)s')

# Silence everything below ERROR from the weasyprint and fontTools loggers.
logging.getLogger('weasyprint').setLevel(logging.ERROR)
logging.getLogger('fontTools').setLevel(logging.ERROR)
|
||||
|
||||
def _header_text(value):
    """Return header *value* as display text, decoding RFC 2047 encoded words.

    Falls back to ``str(value)`` when the encoded words cannot be decoded
    (unknown charset, malformed encoding).
    """
    from email.errors import MessageError
    from email.header import decode_header, make_header

    text = str(value)
    try:
        return str(make_header(decode_header(text)))
    except (LookupError, ValueError, MessageError):
        return text


def _parse_mail_date(date_header):
    """Parse a ``Date`` header into a datetime; return None if absent/invalid."""
    from email.utils import parsedate_to_datetime

    if not date_header:
        return None
    try:
        # Accepts the RFC 2822 variants a fixed strptime() pattern rejects
        # (missing weekday, trailing "(CET)" comment, obsolete zone names...).
        return parsedate_to_datetime(str(date_header))
    except (TypeError, ValueError):
        # Invalid dates raise ValueError on recent Pythons, TypeError on
        # older ones.
        logging.warning(f"Format de date invalide: {date_header}")
        return None


def _output_naming(message, output_file_base, i):
    """Return ``(output_path_without_extension, displayed_date)`` for a mail.

    The path is built from the ISO-formatted date when the ``Date`` header
    can be parsed, and from the message index ``i`` otherwise (header missing
    or invalid).
    """
    date_object = _parse_mail_date(message.get('Date'))
    if date_object is None:
        # Default name; also covers a missing Date header, which previously
        # left the output path undefined.
        return os.path.join(output_file_base, f"email_{i}"), "<i>date inconnue</i>"

    # ":" is replaced by "-" so the timestamp can be used as a file name.
    date_iso = date_object.isoformat().replace(":", "-")
    date_simple = date_object.strftime("%d/%m/%Y, %H:%M:%S")
    return os.path.join(output_file_base, f"{date_iso}"), date_simple


def extract_and_convert_email(message, output_file_base, i):
    """Convert every ``text/html`` part of *message* to a PDF file.

    A small header (sender, recipient, date) is prepended to the HTML before
    rendering. The PDF is written in *output_file_base*, named after the
    message date, or ``email_<i>`` when the date is missing or unparsable.

    Args:
        message: parsed e-mail message (anything offering ``walk()`` and
            ``get()`` like ``email.message.Message``).
        output_file_base: directory in which the PDF is written.
        i: index of the message, used for logging and the fallback file name.

    Returns:
        None. Failures are logged and the part is skipped; nothing is raised.

    Note:
        Several HTML parts in one message, or several messages sharing the
        same date, map to the same output path; the last one written wins.
    """
    from html import escape

    logging.debug(f"Extraction du mail {i}")
    for part in message.walk():
        if part.get_content_maintype() != 'text' or part.get_content_subtype() != 'html':
            continue

        # windows-1252 is the fallback when the part declares no charset.
        charset = part.get_content_charset('windows-1252')
        try:
            payload = part.get_payload(decode=True).decode(charset, errors='replace')
            html_content = str(BeautifulSoup(payload, 'html.parser'))
        except Exception as e:
            # Best effort: one undecodable part must not stop the whole run.
            logging.error(f"Erreur de décodage: {e}")
            logging.debug(f"Contenu brut (non décodé): {part.get_payload(decode=False)}")
            continue

        output_file, date_simple = _output_naming(message, output_file_base, i)

        # Header values are escaped: "Name <addr@host>" would otherwise be
        # parsed as an HTML tag and the address would vanish from the PDF.
        # date_simple is either a formatted date or intentional markup.
        html_header = (
            "<p><ul>"
            f"<li>De : {escape(_header_text(message.get('From')))}</li>"
            f"<li>À : {escape(_header_text(message.get('To')))}</li>"
            f"<li>Date : {date_simple}</li>"
            "</ul><br /></p>"
        )

        # PDF conversion
        try:
            HTML(string=html_header + html_content).write_pdf(output_file + ".pdf")
        except Exception as e:
            # Logged as an error (not debug) so a failed conversion is visible.
            logging.error(f"Erreur lors de la conversion PDF : {e}")
        else:
            logging.info(f"Email converti en PDF : {output_file}.pdf")
|
||||
|
||||
|
||||
def process_mbox(mbox_file):
    """Convert every message of the mbox file *mbox_file* to PDF.

    The PDFs are written to a directory created in the current working
    directory and named after the slugified base name of *mbox_file*
    (extension removed).

    Args:
        mbox_file: path of the mbox file to read.

    Returns:
        None. A missing mbox file is logged as an error and nothing is
        created.
    """
    try:
        # create=False: the default (create=True) silently creates an empty
        # mailbox file when the path does not exist, e.g. on a typo.
        mbox = mailbox.mbox(mbox_file, create=False)
    except mailbox.NoSuchMailboxError:
        logging.error(f"Fichier mbox introuvable : {mbox_file}")
        return

    try:
        logging.info(f"Traitement de {mbox_file}")

        # Output directory creation
        output_dir_name = slugify(os.path.splitext(os.path.basename(mbox_file))[0])
        output_dir = os.path.join(os.getcwd(), output_dir_name)
        os.makedirs(output_dir, exist_ok=True)
        logging.debug(f"Extraction dans le dossier {output_dir}")

        for i, message in enumerate(mbox):
            extract_and_convert_email(message, output_dir, i)
    finally:
        # Release the underlying file handle even if a conversion raised.
        mbox.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The mbox path is expected as the first command-line argument.
    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits with
        # status 1, so misuse is detectable by the calling shell/script.
        sys.exit("Veuillez fournir le fichier mbox en argument.")
    process_mbox(sys.argv[1])
|
Loading…
x
Reference in New Issue
Block a user