"""Build a keyword index of the mails contained in one chapter XML file.

Run as a script, this prints a JSON document of the form
``{"selected": {}, "orphan": {keyword: [mail_ref, ...]}}`` to stdout.
Diagnostics go through ``logging`` so that stdout carries only the JSON.
"""
import argparse
import json
import logging
import os
import sys

import yake
from lxml import etree as et

from keywords import rake

logging.basicConfig(level=logging.DEBUG)

# Stoplist handed to the RAKE extractor (path is relative to the working dir).
RAKE_STOPLIST = 'keywords/FoxStoplist.txt'
# RAKE results whose second element is not strictly above this are discarded.
RAKE_MIN_SCORE = 4.0


def _child_text(element, tag):
    """Return the text of the first ``tag`` child of ``element``, or None."""
    child = element.find(tag)
    return None if child is None else child.text


def _add_keywords(indx, keywords, ref):
    """Append ``ref`` to the index entry of every keyword in ``keywords``.

    Each item of ``keywords`` is a sequence whose first element is the
    keyword string (both extractors are consumed this way).
    """
    for k in keywords:
        indx.setdefault(k[0], []).append(ref)


def index(f):
    """Index the mails of the XML file ``f`` by extracted keyword.

    The file name is expected to look like ``N.xxxx.xml`` (for example
    ``3.Network.xml``); the part before the first dot is used as a prefix
    for every mail reference, giving references such as ``3.12``.

    Every ``mails/mail`` element is run through both the YAKE and the RAKE
    extractor. A mail is recorded once per extractor that yields a given
    keyword, so the same reference may appear twice under one keyword.

    Args:
        f: Path of the XML file to index.

    Returns:
        A dict mapping each keyword to the list of mail references it was
        extracted from, or ``None`` if ``f`` is not an existing file.
    """
    if not os.path.isfile(f):
        logging.error("%s is not a valid file.", f)
        return None

    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]
    indx = {}
    r = rake.Rake(RAKE_STOPLIST, max_words_length=3, min_keyword_frequency=1)
    y = yake.KeywordExtractor(lan="en", top=40)

    root = et.parse(f).getroot()
    for m in root.findall('mails/mail'):
        nbr = _child_text(m, 'nbr')
        content = _child_text(m, 'content')

        if not nbr:
            # Without a number there is no reference to store; skip the mail
            # rather than abort the whole run.
            logging.warning("Skipping a mail without <nbr> in %s", f)
            continue

        # format nbr
        nbr_str = ch + '.' + nbr

        if not content:
            # Nothing to extract keywords from.
            logging.warning("Mail %s has no content; skipped.", nbr_str)
            continue

        # Extraction is best-effort: a failure of one extractor on one mail
        # must not lose the rest of the index. Errors are logged instead of
        # printed so they cannot end up inside the JSON written to stdout.

        # yake
        try:
            _add_keywords(indx, y.extract_keywords(content), nbr_str)
        except Exception as e:
            logging.warning("yake failed on mail %s: %s", nbr_str, e)

        # rake
        try:
            kwr = [x for x in r.run(content) if x[1] > RAKE_MIN_SCORE]
            _add_keywords(indx, kwr, nbr_str)
        except Exception as e:
            logging.warning("rake failed on mail %s: %s", nbr_str, e)

    return indx


if __name__ == '__main__':
    p = argparse.ArgumentParser(description='Builds an index of emails')
    p.add_argument('file', metavar="f", help="xml file to index")
    args = p.parse_args()

    ind = index(args.file)
    if ind is None:
        # index() has already logged why; do not emit a bogus JSON document.
        sys.exit(1)

    ind_out = {'selected': {}, 'orphan': ind}
    print(json.dumps(ind_out, indent=4, sort_keys=True, ensure_ascii=False))