177 lines
4.9 KiB
Python
177 lines
4.9 KiB
Python
|
|
import mysql.connector as mariadb
|
||
|
|
import os, json, glob, logging
|
||
|
|
from lxml import etree as et
|
||
|
|
import db.sql, db.utils
|
||
|
|
|
||
|
|
class DB:
    """Thin wrapper around a MariaDB/MySQL connection for a mail archive.

    Provides table creation, bulk insertion of mails parsed from an XML
    dump, and full-text search helpers.  Usable as a context manager:
    the connection is closed on exit.
    """

    # Class-level default; replaced by a real connection in __init__.
    db_con = None

    def __init__(self, config):
        """Open a database connection.

        config: mapping with 'database', 'host', 'user' and 'password' keys,
        forwarded to db.utils.connect_db().
        """
        self.db_con = db.utils.connect_db(
            config['database'], config['host'],
            config['user'], config['password'])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Release the connection when leaving a `with` block.
        if self.db_con is not None:
            self.db_con.close()

    def _ensure_connection(self, config):
        """(Re)connect if necessary.

        Returns True when a usable connection is available, False otherwise
        (a warning is logged in that case).
        """
        if self.db_con is None and config is not None:
            self.db_con = db.utils.connect_db(
                config['database'], config['host'],
                config['user'], config['password'])

        if self.db_con is None:
            logging.warning(" - no connection... Aborting.")
            return False
        return True

    def create_db(self, config=None):
        """Create the 'full_digest_rescheduled' table (db.sql.CREATE).

        config: optional connection settings used to (re)connect when no
        connection is open yet.  Errors are logged, not raised.
        """
        logging.info("creating table 'full_digest_rescheduled'")

        if not self._ensure_connection(config):
            return

        # Create the cursor outside the try so `finally` can never see an
        # unbound name (the original could raise NameError on close).
        cursor = self.db_con.cursor()
        try:
            cursor.execute(db.sql.CREATE)
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
        finally:
            cursor.close()

        logging.info(" - done.")

    def insert_db(self, xml_file, urls_index_file, config=None):
        """Parse an archive XML dump and insert every <mail> element.

        xml_file: path to the XML dump; its basename (up to the first '.')
            is used as the number prefix for each mail.
        urls_index_file: path to a JSON file mapping mail numbers to URLs.
        config: optional connection settings used to (re)connect.

        Returns None on invalid input paths; otherwise commits after the
        loop and returns nothing.  Duplicate rows (errno 1062) are skipped.
        """
        if not self._ensure_connection(config):
            return

        if not os.path.isfile(xml_file):
            # BUG FIX: the original logged the undefined name `f` here,
            # which raised NameError instead of reporting the bad path.
            logging.error(xml_file + " is not a valid file.")
            return None

        # urls_index_file = os.path.join(config.index['path'], config.index['urls'])
        if not os.path.isfile(urls_index_file):
            logging.error(urls_index_file + " is not a valid file.")
            return None

        with open(urls_index_file) as fp:
            urls = json.load(fp)

        # Archive prefix taken from the dump's file name, e.g. "foo" of "foo.xml".
        ch = os.path.basename(xml_file).split('.')[0]

        root = et.parse(xml_file).getroot()

        logging.info("-----------------")
        logging.info(os.path.basename(xml_file))
        logging.info("-----------------")

        # Created outside the try so `finally` always has a valid cursor.
        cursor = self.db_con.cursor()
        try:
            for m in root.findall('mails/mail'):

                nbr_str = m.find('nbr').text
                to_str = m.find('to').text
                date_str = m.find('date').text
                from_str = m.find('from').text
                subject_str = m.find('subject').text
                content_str = m.find('content').text

                # format nbr: prefix the mail number with the archive name
                nbr_str = ch + '.' + nbr_str

                if nbr_str not in urls:
                    logging.warning(nbr_str + " is not in urls... skipping.")
                    continue

                url = urls[nbr_str]

                date = db.utils.format_date(date_str)
                if date is None:
                    logging.warning("null date: " + nbr_str + " - " + date_str + " - " + from_str)
                    logging.warning("continuing...")
                    continue

                # aaarrrgghhh
                if to_str == "n/a":
                    to_str = "syndicate@aec.at"

                try:
                    # (nbr_, author_name_, to_, subject_, date_, content_, url_)
                    logging.info("inserting " + nbr_str)
                    r = cursor.execute(db.sql.INSERT, (nbr_str, from_str, to_str, subject_str, date, content_str, url))
                    # NOTE(review): mysql-connector's cursor.execute() returns
                    # None, so this check likely never fires -- confirm whether
                    # cursor.rowcount was intended.
                    if r == 0:
                        logging.warning("error no insert...")
                        logging.warning(nbr_str + " - " + from_str + " - " + to_str + " - " + subject_str + " - " + date + " - " + content_str + " - " + url)
                except mariadb.Error as error:
                    if error.errno == 1062:
                        # Duplicate key -- the row already exists; skip it.
                        logging.info("+++db+++ duplicate")
                        continue
                    else:
                        logging.warning("Error...")
                        logging.warning(nbr_str + " - " + from_str + " - " + to_str + " - " + subject_str + " - " + date + " - " + content_str + " - " + url)
                        continue

            self.db_con.commit()

        except Exception:
            # Re-raise unchanged (bare raise preserves the traceback).
            raise
        finally:
            cursor.close()

    def content_search(self, term, bool=True):
        """Full-text search over mail content.

        term: search expression.
        bool: use the BOOLEAN MODE query when True.  (The name shadows the
            builtin; kept for backward compatibility with keyword callers.)

        Returns a list of dicts with keys 'nbr', 'from', 'to', 'subject',
        'date', 'url' -- or None when there is no connection or on error.
        """
        if self.db_con is None:
            logging.warning("No connection to database...")
            return

        # NOTE(review): self.archive_name is never assigned in this class --
        # presumably set by a subclass or externally; verify before use.
        # NOTE(review): the query is built with str.format including `term`,
        # which is open to SQL injection if `term` is untrusted -- consider
        # parameterized queries.
        cursor = self.db_con.cursor(buffered=True)
        try:
            if bool:
                cursor.execute(db.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
            else:
                cursor.execute(db.sql.CONTENT_QUERY.format(self.archive_name, term))

            # nbr_, author_name_, to_, subject_, date_, url_
            results = []
            for (nbr_, author_name_, to_, subject_, date_, url_) in cursor:
                results.append({'nbr': nbr_, 'from': author_name_, 'to': to_, 'subject': subject_, 'date': date_, 'url': url_})
            return results

        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
        finally:
            cursor.close()

    def from_search(self, term, bool=True):
        """Full-text search over mail senders.

        term: search expression.
        bool: use the BOOLEAN MODE query when True (name kept for
            backward compatibility, see content_search).

        Returns a list of (from_, author_name_, subject_, date_, url_)
        tuples -- or None when there is no connection or on error.
        """
        if self.db_con is None:
            logging.warning("No connection to database...")
            return

        cursor = self.db_con.cursor(buffered=True)
        try:
            # BUG FIX: the original referenced `archive.sql`, a module that is
            # never imported (NameError on every call); the queries live in
            # db.sql like everywhere else in this class.
            if bool:
                cursor.execute(db.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
            else:
                cursor.execute(db.sql.FROM_QUERY.format(self.archive_name, term))

            results = []
            for (from_, author_name_, subject_, date_, url_) in cursor:
                results.append((from_, author_name_, subject_, date_, url_))
            return results

        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
        finally:
            cursor.close()