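"""Build a JSON index that maps mail numbers from listserv XML dumps to the
urls recorded in the listservs database."""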
import argparse, os, glob, json, logging, collections
import db.utils, db.listservs
from lxml import etree as et
import config

logging.basicConfig(level=logging.DEBUG)


def list_all(d, ext):
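    """Return all files in directory *d* with extension *ext*, or None if
    *d* is not a valid directory."""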
    if not os.path.isdir(d):
        logging.error(d + " is not a valid directory.")
        return None

    return glob.glob(os.path.join(d, "*." + ext))


def index(f, indx=None):
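    """Index every mail in the XML file *f*: look up each mail's url in the
    listservs database and store it under a "chapter.nbr" key. New entries
    are added to *indx* (a fresh dict if None); returns the index, or None
    on error."""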
    if not os.path.isfile(f):
        logging.error(f + " is not a valid file.")
        return None

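    # config.listservs_db is expected to provide the connection settings
    # used below: 'database', 'host', 'user' and 'password'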
    conf = config.listservs_db
    db_con = db.utils.connect_db(conf['database'], conf['host'], conf['user'], conf['password'])

    if db_con is None:
        logging.error("Cannot connect to db " + conf['database'] + " @ " + conf['host'])
        return None

    tables = db.utils.list_all_tables(db_con)
    if tables is None:
        logging.error("There are no tables in db " + conf['database'] + " @ " + conf['host'])
        db_con.close()
        return None

    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]

    if indx is None:
        indx = {}

    root = et.parse(f).getroot()

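    # map a mail's 'to' address to the database table holding that list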
    to_table_map = {
        '<new-media-curating@jiscmail.ac.uk>': 'crumb',
        'spectre@mikrolisten.de': 'spectre',
        '<empyre@lists.cofa.unsw.edu.au>': 'empyre',
        'nettime-bold@nettime.org': 'nettime_bold',
        'nettime-l@desk.nl': 'nettime_l',
        'mettime-l-temp@material.net': 'nettime_l',
        'nettime-l@bbs.thing.net': 'nettime_l',
        'nettime-l@kein.org': 'nettime_l',
        'oldboys@lists.ccc.de': 'oldboys',
        'n/a': 'syndicate'
    }

    try:
        logging.info("-----------------")
        logging.info(os.path.basename(f))
        logging.info("-----------------")

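        # expected mail layout, inferred from the lookups below:
        # <mails><mail><nbr/><to/><date/><from/></mail>...</mails>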
        for m in root.findall('mails/mail'):

            nbr_str = m.find('nbr').text
            to_str = m.find('to').text
            date_str = m.find('date').text
            from_str = m.find('from').text

            # prefix the mail number with the chapter number, e.g. "3.42"
            nbr_str = ch + '.' + nbr_str

            if nbr_str in indx:
                logging.warning(nbr_str + " is already indexed... skipping")
                continue

            table = to_table_map.get(to_str)
            if table is None:
                logging.warning("Unknown 'to' address " + str(to_str) + " for " + nbr_str + "... skipping")
                continue

            logging.info(nbr_str + " - [" + table + "] - " + date_str + " - " + from_str)

            # look the url up by date and sender in the list's table
            urls = db.listservs.query_url(db_con, date_str, from_str, table)

            if urls is None or len(urls) == 0:
                logging.warning("No url for " + nbr_str)
                continue

            if len(urls) > 1:
                logging.warning("More than one url for " + nbr_str + "... taking first...")

            indx[nbr_str] = urls[0]

        return indx

    except Exception:
        logging.exception("Error while indexing " + f)
        raise

    finally:
        db_con.close()


def parse_nbr(nbr_str):
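    """Turn a "chapter.nbr" key such as "3.42" (or "3.42-1", dropping the
    suffix) into a tuple of ints for numeric sorting: "3.42" -> (3, 42)."""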
    if '-' in nbr_str:
        nbr_str = nbr_str.split('-')[0]
    return tuple(int(j) for j in nbr_str.split('.'))


def save(fn, ind):
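    """Write index *ind* to the JSON file *fn* with keys sorted numerically."""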
    logging.info("saving work")
    with open(fn, 'w') as fp:
        # sort keys numerically before dumping
        ind = collections.OrderedDict(sorted(ind.items(), key=lambda x: parse_nbr(x[0])))
        json.dump(ind, fp, indent=4, ensure_ascii=False)
    logging.info("done.")


if __name__ == '__main__':
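    # pass a single xml file to index it and print the result, or "all" to
    # index every xml file under config.xml['path'] and save the result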

    p = argparse.ArgumentParser(description='Extract urls for mails in xml file')
    p.add_argument('file', metavar="f", help="xml file to extract urls for, or 'all' for every xml file")

    ind = {}
    args = p.parse_args()
    if args.file == "all":
        try:
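            # load the existing url index and extend it in place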
            urls = os.path.join(config.index['path'], config.index['urls'])
            with open(urls) as fp:
                ind = json.load(fp)

            xml_files = list_all(config.xml['path'], 'xml') or []
            for x in xml_files:
                if index(x, indx=ind) is None:
                    logging.error("Error processing - " + x)

            save(urls, ind)
        except KeyboardInterrupt:
            save(urls, ind)
# logging.info("savig work")
|
||
|
|
# with open(urls, 'w') as fp:
|
||
|
|
# # sort keys
|
||
|
|
# ind = collections.OrderedDict(sorted(ind.items(), key=lambda x: tuple([int(j) for j in x[0].split('.')])))
|
||
|
|
# json.dump(ind, fp, indent=4, ensure_ascii=False)
|
||
|
|
# logging.info("done.")
|
||
|
|
    else:
        ind = index(args.file)
        print(json.dumps(ind, indent=4, sort_keys=True, ensure_ascii=False))