76 lines
1.4 KiB
Python
76 lines
1.4 KiB
Python
|
|
import argparse, os, sys, json, logging
|
||
|
|
from keywords import rake
|
||
|
|
from lxml import etree as et
|
||
|
|
|
||
|
|
import yake
|
||
|
|
|
||
|
|
# Configure the root logger once at import time so that DEBUG-and-above
# records from this module are emitted through logging's default handler.
logging.basicConfig(level=logging.DEBUG)
|
||
|
|
|
||
|
|
# Minimum rake score a phrase must exceed to be kept in the index.
_RAKE_MIN_SCORE = 4.0


def _add_keywords(indx, keywords, nbr_str):
    """Record ``nbr_str`` under every keyword of ``keywords`` in ``indx``."""
    for kw in keywords:
        indx.setdefault(kw, []).append(nbr_str)


def index(f):
    """Build a keyword index for the mails contained in an XML file.

    The file name is expected to look like ``N.xxxx.xml`` (for example
    ``3.Network.xml``); the part before the first dot is used as a prefix
    for every mail identifier, giving ids of the form ``N.<nbr>``.

    Each ``mails/mail`` element is run through two keyword extractors
    (yake and rake). Every keyword found is mapped to the list of mail ids
    it was extracted from. A keyword found by both extractors for the same
    mail is recorded twice, once per extractor.

    Args:
        f: Path of the XML file to index.

    Returns:
        A dict mapping keyword -> list of mail ids, or ``None`` when ``f``
        is not an existing regular file.
    """
    if not os.path.isfile(f):
        logging.error("%s is not a valid file.", f)
        return None

    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]

    indx = {}

    r = rake.Rake(
        'keywords/FoxStoplist.txt',
        max_words_length=3,
        min_keyword_frequency=1,
    )
    y = yake.KeywordExtractor(lan="en", top=40)

    root = et.parse(f).getroot()

    for m in root.findall('mails/mail'):
        # A mail without a usable <nbr> cannot be referenced from the index;
        # skip it instead of aborting the whole run on an AttributeError.
        nbr_el = m.find('nbr')
        if nbr_el is None or nbr_el.text is None:
            logging.warning("Skipping a mail without a <nbr> value in %s", f)
            continue

        # format nbr
        nbr_str = ch + '.' + nbr_el.text

        content_el = m.find('content')
        content = content_el.text if content_el is not None else None
        if not content:
            logging.warning("Mail %s has no content; nothing to index.", nbr_str)
            continue

        # yake: each result is indexable and its first item is the keyword.
        # Extraction is best-effort: a failure is logged (not printed, so
        # stdout stays clean for the caller's output) and the mail is still
        # handed to the other extractor.
        try:
            yake_keywords = [k[0] for k in y.extract_keywords(content)]
        except Exception:
            logging.exception("yake extraction failed for mail %s", nbr_str)
        else:
            _add_keywords(indx, yake_keywords, nbr_str)

        # rake: keep only results whose second item (presumably the phrase
        # score) exceeds the threshold.
        try:
            rake_keywords = [
                k[0] for k in r.run(content) if k[1] > _RAKE_MIN_SCORE
            ]
        except Exception:
            logging.exception("rake extraction failed for mail %s", nbr_str)
        else:
            _add_keywords(indx, rake_keywords, nbr_str)

    return indx
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Command-line entry point: index one XML file and print the resulting
    # index to stdout as JSON.
    p = argparse.ArgumentParser(description='Builds an index of emails')
    p.add_argument('file', metavar="f", help="xml file to index")

    args = p.parse_args()

    ind = index(args.file)
    if ind is None:
        # The path was rejected (the reason has been logged). Exit with a
        # failure status instead of printing a JSON document whose "orphan"
        # entry is null and reporting success.
        sys.exit(1)

    # All extracted keywords start out under 'orphan'; 'selected' is emitted
    # empty.
    ind_out = {'selected': {}, 'orphan': ind}

    print(json.dumps(ind_out, indent=4, sort_keys=True, ensure_ascii=False))
|
||
|
|
|
||
|
|
|
||
|
|
|