35 lines
730 B
Python
35 lines
730 B
Python
import xml.etree.ElementTree as et
|
|
import os, logging, glob, hashlib
|
|
|
|
# Configure the root logger at DEBUG level so the INFO messages emitted by
# this script are not filtered out.
logging.basicConfig(level=logging.DEBUG)
|
|
|
|
def check_duplicates(xml_file):
    """Log every mail in *xml_file* that repeats an earlier mail.

    Mails are read from the ``mails/mail`` path below the document root.
    Two mails count as duplicates when the text of their ``from``,
    ``subject`` and ``date`` child elements is identical. A missing or
    empty field is treated as an empty string instead of raising.

    Args:
        xml_file: Location of the XML document to scan; handed unchanged
            to ``xml.etree.ElementTree.parse``.

    Returns:
        None. Each duplicate is reported through ``logging.info``.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not
            well-formed XML.
    """
    # A set keeps the membership test O(1); a list would make the whole
    # scan quadratic in the number of mails.
    seen = set()
    root = et.parse(xml_file).getroot()

    for mail in root.findall('mails/mail'):
        # findtext() returns "" for an empty element and the default for a
        # missing one, so incomplete mails no longer crash the scan.
        sender = mail.findtext('from', default='')
        subject = mail.findtext('subject', default='')
        date = mail.findtext('date', default='')

        # Compare the fields as a tuple rather than as one concatenated
        # string, so ("ab", "c") and ("a", "bc") are not mistaken for the
        # same mail.
        key = (sender, subject, date)
        if key in seen:
            logging.info("* Duplicate: %s - %s - %s", subject, sender, date)
        else:
            seen.add(key)
|
|
|
|
if __name__ == "__main__":
    # Collect every XML file that sits directly inside the output directory.
    out_dir = "out/"
    pattern = os.path.join(out_dir, "*.xml")
    xml_files = glob.glob(pattern)

    logging.info("Checking duplicates")

    for xml_path in xml_files:
        # Announce the file by its base name, then report its duplicates.
        file_name = os.path.basename(xml_path)
        logging.info("> " + file_name)
        check_duplicates(xml_path)
|
|
|
|
|