35 lines
730 B
Python
Raw Permalink Normal View History

2019-12-31 09:53:46 +01:00
import xml.etree.ElementTree as et
import os, logging, glob, hashlib
# Configure the root logger to emit DEBUG and above, so the INFO-level
# messages logged by this script are actually shown.
logging.basicConfig(level=logging.DEBUG)
def check_duplicates(xml_file):
    """Log every mail in *xml_file* that repeats an earlier mail.

    Mails are read from the ``mails/mail`` elements below the document
    root. Two mails count as duplicates when their ``from``, ``subject``
    and ``date`` texts are all equal; a missing or empty field is treated
    as the empty string instead of aborting the scan.

    Args:
        xml_file: Path of the XML document to scan.

    Returns:
        The number of duplicate mails found (0 when there are none).
    """
    # A set gives O(1) membership tests; only the digests are kept, not
    # the full field texts.
    seen = set()
    duplicates = 0
    root = et.parse(xml_file).getroot()
    for mail in root.findall('mails/mail'):
        # findtext() yields "" both for an absent element (via default)
        # and for an element without text, so no None reaches the join.
        sender = mail.findtext('from', default="")
        subject = mail.findtext('subject', default="")
        date = mail.findtext('date', default="")
        # Join with NUL so field boundaries are part of the key:
        # ("ab", "c", ...) must not collide with ("a", "bc", ...).
        key = "\x00".join((sender, subject, date))
        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
        if digest in seen:
            duplicates += 1
            logging.info("* Duplicate: %s - %s - %s", subject, sender, date)
        else:
            seen.add(digest)
    return duplicates
# Script entry point: scan every XML file in the output directory and
# report duplicate mails per file.
if __name__ == "__main__":
    out_dir = "out/"
    # glob() returns matches in arbitrary order; sorting makes the report
    # order reproducible between runs.
    xml_files = sorted(glob.glob(os.path.join(out_dir, "*.xml")))
    logging.info("Checking duplicates")
    for xml_path in xml_files:
        logging.info("> %s", os.path.basename(xml_path))
        check_duplicates(xml_path)