2020-01-21 11:38:31 +01:00

76 lines
1.4 KiB
Python

import argparse, os, sys, json, logging
from keywords import rake
from lxml import etree as et
import yake
# Configure the root logger so that every message from DEBUG level upwards is emitted.
logging.basicConfig(level=logging.DEBUG)
def _add_reference(indx, keyword, ref):
    """Record ``ref`` under ``keyword`` in ``indx``, ignoring repeated refs."""
    refs = indx.setdefault(keyword, [])
    if ref not in refs:
        refs.append(ref)


def index(f):
    """Build a keyword index for the mails stored in one XML file.

    The file name is expected to look like ``N.xxxx.xml`` (for example
    ``3.Network.xml``); the part before the first dot is used as a prefix
    for every mail reference.  For each ``mails/mail`` element the text of
    its ``nbr`` and ``content`` children is read, keywords are extracted
    from the content with both the YAKE and the RAKE extractor, and every
    keyword is mapped to the ``"<prefix>.<nbr>"`` references of the mails
    it was extracted from.

    Extraction is best-effort: if one extractor fails on a mail, the
    failure is logged and processing continues with the next step.  Mails
    lacking a ``nbr`` or a ``content`` value are skipped with a warning.

    Args:
        f: Path of the XML file to index.

    Returns:
        A dict mapping each keyword to a list of mail references (in
        document order, each reference listed once per keyword), or
        ``None`` when ``f`` is not an existing file; an error is logged
        in that case.
    """
    if not os.path.isfile(f):
        # Lazy %-formatting also copes with path-like (non-str) arguments.
        logging.error("%s is not a valid file.", f)
        return None

    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]

    indx = {}
    r = rake.Rake('keywords/FoxStoplist.txt', max_words_length=3, min_keyword_frequency=1)
    y = yake.KeywordExtractor(lan="en", top=40)

    root = et.parse(f).getroot()
    for m in root.findall('mails/mail'):
        # findtext() yields None for a missing element instead of raising.
        nbr = m.findtext('nbr')
        content = m.findtext('content')
        if not nbr:
            logging.warning("Skipping a mail without a 'nbr' value in %s", f)
            continue
        if not content:
            logging.warning("Skipping mail %s.%s: no content", ch, nbr)
            continue

        # format nbr
        nbr_str = ch + '.' + nbr

        # yake: the first item of each result is used as the keyword.
        try:
            for k in y.extract_keywords(content):
                _add_reference(indx, k[0], nbr_str)
        except Exception:
            # Log instead of print(): stdout carries the JSON index when
            # this module is run as a script and must stay clean.
            logging.exception("yake keyword extraction failed for mail %s", nbr_str)

        # rake: keep only results whose second item (presumably the
        # score) is above 4.0.
        try:
            for k in r.run(content):
                if k[1] > 4.0:
                    _add_reference(indx, k[0], nbr_str)
        except Exception:
            logging.exception("rake keyword extraction failed for mail %s", nbr_str)

    return indx
if __name__ == '__main__':
    # Command-line entry point: index one XML file and print the result
    # as JSON on stdout.
    p = argparse.ArgumentParser(description='Builds an index of emails')
    p.add_argument('file', metavar="f", help="xml file to index")
    args = p.parse_args()

    ind = index(args.file)
    if ind is None:
        # index() has already logged the reason; exit with a failure
        # status rather than emitting an index whose 'orphan' entry is null.
        sys.exit(1)

    # All extracted keywords are emitted under 'orphan'; 'selected' is
    # emitted empty.
    ind_out = {'selected': {}, 'orphan': ind}
    print(json.dumps(ind_out, indent=4, sort_keys=True, ensure_ascii=False))