# Exported 2020-01-25 10:57:13 +01:00 (Python script, 154 lines, 3.8 KiB).
# Standard library
import argparse
import collections
import glob
import json
import logging
import os

# Third-party
import mysql.connector as mariadb
from lxml import etree as et

# Local
import config
import db.listservs
import db.utils

logging.basicConfig(level=logging.DEBUG)
def list_all(d, ext):
    """Return the paths of all files in directory *d* with extension *ext*.

    Returns None (and logs an error) when *d* is not a directory.
    """
    if not os.path.isdir(d):
        logging.error("%s is not a valid directory.", d)
        return None
    # glob.glob already returns a list; no need to re-wrap it in a comprehension.
    return glob.glob(os.path.join(d, "*." + ext))
def index(f, indx=None):
    """Index archive urls for every mail in the xml file *f*.

    For each <mail> element, the message's url is looked up in the
    listservs database (matched on date, sender and the table derived
    from the 'to' address) and stored in *indx* under the key
    "<chapter>.<nbr>", where <chapter> comes from the filename.

    Returns the updated index dict, or None on error (invalid file,
    no db connection, or an empty database).
    """
    if not os.path.isfile(f):
        logging.error("%s is not a valid file.", f)
        return None
    conf = config.listservs_db
    db_con = db.utils.connect_db(conf['database'], conf['host'], conf['user'], conf['password'])
    if db_con is None:
        logging.error("Can not connect to db %s @ %s", conf['database'], conf['host'])
        return None
    tables = db.utils.list_all_tables(db_con)
    if tables is None:
        logging.error("There are no table in db %s @ %s", conf['database'], conf['host'])
        db_con.close()
        return None
    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]
    if indx is None:
        indx = {}
    root = et.parse(f).getroot()
    # Map a mail's 'to' address to the db table holding that list's archive.
    # (A duplicate 'nettime-l@desk.nl' entry was removed — it mapped to the
    # same table and was silently overwritten.)
    to_table_map = {
        '<new-media-curating@jiscmail.ac.uk>': 'crumb',
        'spectre@mikrolisten.de': 'spectre',
        '<empyre@lists.cofa.unsw.edu.au>': 'empyre',
        'nettime-bold@nettime.org': 'nettime_bold',
        'nettime-l@desk.nl': 'nettime_l',
        # NOTE(review): 'mettime' looks like a typo carried over from the
        # source data ('nettime-l-temp'?) — kept as-is; verify against the xml.
        'mettime-l-temp@material.net': 'nettime_l',
        'nettime-l@bbs.thing.net': 'nettime_l',
        'nettime-l@kein.org': 'nettime_l',
        'oldboys@lists.ccc.de': 'oldboys',
        'n/a': 'syndicate'
    }
    try:
        logging.info("-----------------")
        logging.info(os.path.basename(f))
        logging.info("-----------------")
        for m in root.findall('mails/mail'):
            nbr_str = m.find('nbr').text
            to_str = m.find('to').text
            date_str = m.find('date').text
            from_str = m.find('from').text
            # Qualify the mail number with the chapter prefix: "<ch>.<nbr>".
            nbr_str = ch + '.' + nbr_str
            if nbr_str in indx:
                logging.warning("%s is already indexed... skipping", nbr_str)
                continue
            # Raises KeyError for an unmapped 'to' address; handled below.
            table = to_table_map[to_str]
            logging.info("%s - [%s] - %s - %s", nbr_str, table, date_str, from_str)
            urls = db.listservs.query_url(db_con, date_str, from_str, table)
            if urls is None or len(urls) == 0:
                logging.warning("No url for %s", nbr_str)
                continue
            if len(urls) > 1:
                logging.warning("More than one url for %s... taking first...", nbr_str)
            indx[nbr_str] = urls[0]
        return indx
    except Exception:
        # Replaced a leftover debug print("aaaaa") with a proper traceback log.
        logging.exception("Error while indexing %s", f)
        raise
    finally:
        db_con.close()
def parse_nbr(nbr_str):
    """Parse a "chapter.number" key like "3.12" into a tuple of ints.

    Anything after a '-' (e.g. "3.12-1") is ignored. The tuple is used
    as a numeric sort key when saving the index.
    """
    # partition handles both the dashed and the plain form in one call.
    nbr_str = nbr_str.partition('-')[0]
    # Generator expression — no need to materialize an intermediate list.
    return tuple(int(part) for part in nbr_str.split('.'))
def save(fn, ind):
    """Write index *ind* to file *fn* as pretty-printed json.

    Keys are sorted numerically by their "chapter.number" value
    (via parse_nbr), not lexicographically.
    """
    logging.info("saving work")  # fixed typo: was "savig work"
    with open(fn, 'w') as fp:
        ind = collections.OrderedDict(sorted(ind.items(), key=lambda x: parse_nbr(x[0])))
        json.dump(ind, fp, indent=4, ensure_ascii=False)
    logging.info("done.")
if __name__ == '__main__':
    p = argparse.ArgumentParser(description='Extract urls for mails in xml file')
    p.add_argument('file', metavar="f", help="xml file to extract urls for")
    ind = {}
    args = p.parse_args()
    if args.file == "all":
        # Batch mode: extend the existing url index with every xml file,
        # saving partial work on Ctrl-C.
        # urls is computed before the try so the KeyboardInterrupt handler
        # can always reference it.
        urls = os.path.join(config.index['path'], config.index['urls'])
        try:
            with open(urls) as fp:
                ind = json.load(fp)
            xml_files = list_all(config.xml['path'], 'xml')
            # list_all returns None for an invalid directory; guard the loop.
            for x in xml_files or []:
                if index(x, indx=ind) is None:
                    logging.error("Error processing - %s", x)
            save(urls, ind)
        except KeyboardInterrupt:
            save(urls, ind)
    else:
        # Single-file mode: print the extracted urls to stdout.
        ind = index(args.file)
        print(json.dumps(ind, indent=4, sort_keys=True, ensure_ascii=False))