import argparse
import collections
import glob
import json
import logging
import os

from lxml import etree as et
import mysql.connector as mariadb

import config
import db.listservs
import db.utils

logging.basicConfig(level=logging.DEBUG)


def list_all(d, ext):
    """Return all files in directory d that have the given extension."""
    if not os.path.isdir(d):
        logging.error(d + " is not a valid directory.")
        return None
    return [f for f in glob.glob(os.path.join(d, "*." + ext))]


def index(f, indx=None):
    """Look up an archive url for every mail in the xml file f and add it to indx."""
    if not os.path.isfile(f):
        logging.error(f + " is not a valid file.")
        return None

    conf = config.listservs_db
    db_con = db.utils.connect_db(conf['database'], conf['host'], conf['user'], conf['password'])
    if db_con is None:
        logging.error("Cannot connect to db " + conf['database'] + " @ " + conf['host'])
        return None

    tables = db.utils.list_all_tables(db_con)
    if tables is None:
        logging.error("There are no tables in db " + conf['database'] + " @ " + conf['host'])
        db_con.close()
        return None

    # filename should be of the form N.xxxx.xml, e.g. 3.Network.xml;
    # the leading N is used as a chapter prefix for the index keys
    ch = os.path.basename(f).split('.')[0]

    if indx is None:
        indx = {}

    root = et.parse(f).getroot()

    # map 'to' addresses to their database table
    # note: the two empty-string keys were already empty in the original source;
    # in a dict literal the later entry ('empyre') overrides the earlier one ('crumb')
    to_table_map = {
        '': 'crumb',
        'spectre@mikrolisten.de': 'spectre',
        '': 'empyre',
        'nettime-bold@nettime.org': 'nettime_bold',
        'nettime-l@desk.nl': 'nettime_l',
        'mettime-l-temp@material.net': 'nettime_l',
        'nettime-l@bbs.thing.net': 'nettime_l',
        'nettime-l@kein.org': 'nettime_l',
        'oldboys@lists.ccc.de': 'oldboys',
        'n/a': 'syndicate'
    }

    try:
        logging.info("-----------------")
        logging.info(os.path.basename(f))
        logging.info("-----------------")

        for m in root.findall('mails/mail'):
            nbr_str = m.find('nbr').text
            to_str = m.find('to').text
            date_str = m.find('date').text
            from_str = m.find('from').text

            # prefix the mail number with the chapter number from the filename
            nbr_str = ch + '.' + nbr_str
            if nbr_str in indx:
                logging.warning(nbr_str + " is already indexed... skipping")
                continue

            table = to_table_map[to_str]
            logging.info(nbr_str + " - [" + table + "] - " + date_str + " - " + from_str)

            # query_url(db_con, date_str, from_str, db_str)
            urls = db.listservs.query_url(db_con, date_str, from_str, table)
            if urls is None or len(urls) == 0:
                logging.warning("No url for " + nbr_str)
                continue
            if len(urls) > 1:
                logging.warning("More than one url for " + nbr_str + "... taking first...")

            indx[nbr_str] = urls[0]
        return indx
    except Exception as e:
        logging.error("Error while indexing " + f)
        raise e
    finally:
        db_con.close()


def parse_nbr(nbr_str):
    """Parse an index key like '3.12' (or '3.12-1') into a sortable tuple of ints."""
    if '-' in nbr_str:
        nbr_str = nbr_str.split('-')[0]
    return tuple([int(j) for j in nbr_str.split('.')])


def save(fn, ind):
    logging.info("saving work")
    with open(fn, 'w') as fp:
        # sort keys numerically before dumping
        ind = collections.OrderedDict(sorted(ind.items(), key=lambda x: parse_nbr(x[0])))
        json.dump(ind, fp, indent=4, ensure_ascii=False)
    logging.info("done.")


if __name__ == '__main__':
    p = argparse.ArgumentParser(description='Extract urls for mails in xml file')
    p.add_argument('file', metavar="f", help="xml file to extract urls for")

    ind = {}
    args = p.parse_args()
    if args.file == "all":
        try:
            urls = os.path.join(config.index['path'], config.index['urls'])
            with open(urls) as fp:
                ind = json.load(fp)
            xml_files = list_all(config.xml['path'], 'xml')
            for x in xml_files:
                if index(x, indx=ind) is None:
                    logging.error("Error processing - " + x)
            save(urls, ind)
        except KeyboardInterrupt:
            # save partial progress if interrupted mid-run
            save(urls, ind)
    else:
        ind = index(args.file)
        print(json.dumps(ind, indent=4, sort_keys=True, ensure_ascii=False))
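
# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original script): the code above assumes a
# local `config` module exposing `listservs_db`, `index`, and `xml` settings,
# plus `db.utils.connect_db`, `db.utils.list_all_tables`, and
# `db.listservs.query_url` helpers. The values below are illustrative
# assumptions only, showing the shape of configuration this script expects:
#
#   # config.py (hypothetical example values)
#   listservs_db = {'database': 'listservs', 'host': 'localhost',
#                   'user': 'user', 'password': 'secret'}
#   index = {'path': 'data/index', 'urls': 'urls.json'}
#   xml = {'path': 'data/xml'}
#
# Typical invocations (script name is hypothetical):
#
#   python <script>.py 3.Network.xml   # index a single xml file, print result
#   python <script>.py all             # index every xml file under config.xml['path']
#                                      # and write the merged index to disk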