# Extracted from a repository web view: Python source, 177 lines, 4.8 KiB; revision dated 2020-01-25 10:57:13 +01:00.
import mysql.connector as mariadb
import os, json, glob, logging
from lxml import etree as et
import db.sql, db.utils
class DB:
    """Wrapper around one MariaDB/MySQL connection used to store and search mails.

    Instances are context managers: leaving the ``with`` block closes the
    connection.  All methods log problems through the root ``logging`` logger
    and return ``None`` when no connection is available.

    NOTE(review): the search methods interpolate ``term`` into the SQL text
    with ``str.format`` because the query templates in ``db.sql`` are written
    that way.  If ``term`` can come from untrusted input this is an SQL
    injection risk; the templates should be moved to ``%s`` placeholders and
    the term passed as a parameter to ``cursor.execute``.
    """

    # Class-level default so ``db_con`` exists even when ``__init__`` is
    # bypassed or fails before assigning it.
    db_con = None

    # Lazy %-style template used to dump one row to the log.  Unlike string
    # concatenation it cannot fail when a field is None or not a str.
    _ROW_LOG_FORMAT = "%s - %s - %s - %s - %s - %s - %s"

    def __init__(self, config):
        """Open the connection described by *config*.

        Args:
            config: mapping providing the keys ``database``, ``host``,
                ``user`` and ``password``.
        """
        self.db_con = self._connect(config)

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection, if any; exceptions are not suppressed."""
        if self.db_con is not None:
            self.db_con.close()

    @staticmethod
    def _connect(config):
        """Open a connection through ``db.utils.connect_db`` using *config*."""
        return db.utils.connect_db(
            config['database'], config['host'], config['user'], config['password']
        )

    def _ensure_connection(self, config=None):
        """Connect lazily from *config* when needed; return True if connected."""
        if self.db_con is None and config is not None:
            self.db_con = self._connect(config)
        if self.db_con is None:
            logging.warning(" - no connection... Aborting.")
            return False
        return True

    def _fetch_rows(self, query):
        """Run *query* on a buffered cursor and return all rows as a list.

        Returns ``None`` (after logging) when the connector raises an error.
        """
        cursor = None
        try:
            cursor = self.db_con.cursor(buffered=True)
            cursor.execute(query)
            return list(cursor)
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
            return None
        finally:
            # ``cursor`` stays None when ``cursor()`` itself failed.
            if cursor is not None:
                cursor.close()

    def create_db(self, config=None):
        """Execute the ``db.sql.CREATE`` statement.

        Args:
            config: optional connection settings (same keys as for
                ``__init__``), used only when no connection is open yet.

        Connector errors are logged, not raised.
        """
        logging.info("creating table 'full_digest_rescheduled'")
        if not self._ensure_connection(config):
            return
        cursor = None
        try:
            cursor = self.db_con.cursor()
            cursor.execute(db.sql.CREATE)
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
        finally:
            if cursor is not None:
                cursor.close()
        logging.info(" - done.")

    def insert_db(self, xml_file, urls_index_file, config=None):
        """Insert every ``mails/mail`` element of *xml_file* into the database.

        Args:
            xml_file: path of the XML file to read.  The part of its base
                name before the first ``.`` is prefixed to every mail number.
            urls_index_file: path of a JSON file mapping the prefixed mail
                number to its URL.  Mails missing from it are skipped.
            config: optional connection settings (same keys as for
                ``__init__``), used only when no connection is open yet.

        Returns:
            Always ``None``.  Missing files, missing connections, unknown
            numbers, unparsable dates and connector errors are logged and
            skipped; any other exception propagates after the cursor is
            closed.
        """
        if not self._ensure_connection(config):
            return None

        if not os.path.isfile(xml_file):
            logging.error(xml_file + " is not a valid file.")
            return None

        if not os.path.isfile(urls_index_file):
            logging.error(urls_index_file + " is not a valid file.")
            return None

        with open(urls_index_file) as fp:
            urls = json.load(fp)

        ch = os.path.basename(xml_file).split('.')[0]
        root = et.parse(xml_file).getroot()

        cursor = None
        try:
            logging.info("-----------------")
            logging.info(os.path.basename(xml_file))
            logging.info("-----------------")

            cursor = self.db_con.cursor()

            for m in root.findall('mails/mail'):
                nbr_str = m.find('nbr').text
                to_str = m.find('to').text
                date_str = m.find('date').text
                from_str = m.find('from').text
                subject_str = m.find('subject').text
                content_str = m.find('content').text

                # Mail numbers are only unique per file, so qualify them.
                nbr_str = "{}.{}".format(ch, nbr_str)

                if nbr_str not in urls:
                    logging.warning(nbr_str + " is not in urls... skipping.")
                    continue
                url = urls[nbr_str]

                date = db.utils.format_date(date_str)
                if date is None:
                    logging.warning("null date: %s - %s - %s", nbr_str, date_str, from_str)
                    logging.warning("continuing...")
                    continue

                # Mails without a recipient are attributed to the list address.
                if to_str == "n/a":
                    to_str = "syndicate@aec.at"

                # Column order: nbr_, author_name_, to_, subject_, date_, content_, url_
                row = (nbr_str, from_str, to_str, subject_str, date, content_str, url)
                try:
                    logging.info("inserting " + nbr_str)
                    cursor.execute(db.sql.INSERT, row)
                    # ``execute`` returns None for a single statement, so the
                    # affected-row count has to be read from ``rowcount``.
                    if cursor.rowcount == 0:
                        logging.warning("error no insert...")
                        logging.warning(self._ROW_LOG_FORMAT, *row)
                except mariadb.Error as error:
                    # 1062 is MySQL's duplicate-entry error (ER_DUP_ENTRY).
                    if error.errno == 1062:
                        logging.info("+++db+++ duplicate")
                    else:
                        logging.warning("Error...")
                        logging.warning(self._ROW_LOG_FORMAT, *row)

            # NOTE(review): the source this was recovered from had lost its
            # indentation; a single commit after the loop is assumed here.
            self.db_con.commit()
        finally:
            if cursor is not None:
                cursor.close()

    def content_search(self, term, bool=True):
        """Search mails with one of the ``db.sql`` content queries.

        Args:
            term: search term interpolated into the query template.
            bool: when true use ``db.sql.CONTENT_QUERY_BOOLEAN``, otherwise
                ``db.sql.CONTENT_QUERY``.  (The name shadows the builtin but
                is kept for keyword-argument compatibility.)

        Returns:
            A list of dicts with the keys ``nbr``, ``from``, ``to``,
            ``subject``, ``date`` and ``url``; ``None`` when there is no
            connection or the query fails.
        """
        if self.db_con is None:
            logging.warning("Not connection to database...")
            return None

        template = db.sql.CONTENT_QUERY_BOOLEAN if bool else db.sql.CONTENT_QUERY
        rows = self._fetch_rows(template.format(term))
        if rows is None:
            return None

        return [
            {'nbr': nbr_, 'from': author_name_, 'to': to_, 'subject': subject_, 'date': date_, 'url': url_}
            for (nbr_, author_name_, to_, subject_, date_, url_) in rows
        ]

    def from_search(self, term, bool=True):
        """Search mails with one of the ``db.sql`` sender queries.

        Args:
            term: search term interpolated into the query template.
            bool: when true use ``db.sql.FROM_QUERY_BOOLEAN``, otherwise
                ``db.sql.FROM_QUERY``.  (The name shadows the builtin but is
                kept for keyword-argument compatibility.)

        Returns:
            A list of 5-tuples in the column order returned by the query;
            ``None`` when there is no connection or the query fails.
        """
        if self.db_con is None:
            logging.warning("Not connection to database...")
            return None

        template = db.sql.FROM_QUERY_BOOLEAN if bool else db.sql.FROM_QUERY
        rows = self._fetch_rows(template.format(term))
        if rows is None:
            return None

        return [
            (from_, author_name_, subject_, date_, url_)
            for (from_, author_name_, subject_, date_, url_) in rows
        ]