Nettoyage du HTML

This commit is contained in:
Antoine Van Elstraete 2025-02-13 17:01:17 +01:00
parent 584a5e82b1
commit 220b102830

View File

@ -22,6 +22,11 @@ def extract_and_convert_email(message, output_file_base, i):
try:
payload = part.get_payload(decode=True).decode(charset, errors='replace')
soup = BeautifulSoup(payload, 'html.parser')
# Clean the HTML: strip every attribute that is not explicitly whitelisted.
# The trailing comma matters: ("style") is just the string "style", and
# `name not in "style"` is a substring test that would also keep attributes
# whose name happens to be a substring of it (e.g. "s", "le").
attr_whitelist = ("style",)
for tag in soup.find_all(True):
    # Snapshot the names to remove first, since the deletion below mutates
    # tag.attrs and it must not be modified while being iterated.
    for attr in [name for name in tag.attrs if name not in attr_whitelist]:
        del tag[attr]
html_content = str(soup)
# Extraction de la date et formatage ISO