import glob
import hashlib
import logging
import os
import xml.etree.ElementTree as et

logging.basicConfig(level=logging.DEBUG)

# Placed between the hashed fields so that different (from, subject, date)
# triples can never produce the same hash input (e.g. "ab" + "c" versus
# "a" + "bc"). NUL is not a legal XML character, so it cannot occur inside
# any of the parsed field values.
_FIELD_SEPARATOR = "\x00"


def check_duplicates(xml_file):
    """Log every mail in *xml_file* that repeats an earlier mail.

    Mails are read from the ``mails/mail`` elements below the document root.
    Two mails count as duplicates when their ``from``, ``subject`` and
    ``date`` texts are all equal. The first occurrence is kept silently;
    each later occurrence is reported through ``logging.info``.

    A missing or empty ``from``/``subject``/``date`` element is treated as
    an empty string instead of raising.

    Args:
        xml_file: Source handed to ``xml.etree.ElementTree.parse`` (a file
            name or file object).

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not
            well-formed XML.
        OSError: If the file cannot be opened.
    """
    # A set gives O(1) membership tests; only digests are stored, not the
    # full field values.
    seen = set()
    root = et.parse(xml_file).getroot()
    for mail in root.findall('mails/mail'):
        # findtext() returns the default for an absent element and "" for
        # an element without text, so the values below are always strings.
        sender = mail.findtext('from', default="")
        subject = mail.findtext('subject', default="")
        date = mail.findtext('date', default="")

        key = _FIELD_SEPARATOR.join((sender, subject, date))
        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
        if digest in seen:
            logging.info("* Duplicate: %s - %s - %s", subject, sender, date)
        else:
            seen.add(digest)


def main():
    """Run the duplicate check on every ``*.xml`` file in ``out/``."""
    directory = "out/"
    # glob() returns files in arbitrary order; sort for reproducible output.
    xml_files = sorted(glob.glob(os.path.join(directory, "*.xml")))
    logging.info("Checking duplicates")
    for path in xml_files:
        logging.info("> %s", os.path.basename(path))
        check_duplicates(path)


if __name__ == "__main__":
    main()