index + db

This commit is contained in:
gauthiier
2020-01-25 10:57:13 +01:00
parent afc71795d1
commit 56aab9e545
19 changed files with 6027 additions and 158 deletions
View File
+176
View File
@@ -0,0 +1,176 @@
import mysql.connector as mariadb
import os, json, glob, logging
from lxml import etree as et
import db.sql, db.utils
class DB:
    """Wrapper around a MariaDB connection to the `full_digest_rescheduled` table.

    Instances can be used as context managers; the connection is closed on
    exit. Wherever a ``config`` is accepted it is a mapping providing the
    'database', 'host', 'user' and 'password' keys.
    """

    # Class-level default so the attribute exists even if __init__ is bypassed.
    db_con = None

    def __init__(self, config):
        """Open the connection described by *config*."""
        self.db_con = self._connect(config)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.db_con is not None:
            self.db_con.close()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _connect(config):
        """Return a new connection built from *config* (None on failure)."""
        return db.utils.connect_db(config['database'], config['host'], config['user'], config['password'])

    def _ensure_connection(self, config=None):
        """Connect lazily from *config* if needed; return True when connected."""
        if self.db_con is None and config is not None:
            self.db_con = self._connect(config)
        if self.db_con is None:
            logging.warning(" - no connection... Aborting.")
            return False
        return True

    @staticmethod
    def _child_text(node, tag):
        """Return the text of child *tag* of *node*, or None if the child is absent."""
        child = node.find(tag)
        return None if child is None else child.text

    @staticmethod
    def _describe_row(row):
        """Render a row for logging; str() copes with datetime and None fields."""
        return " - ".join(str(value) for value in row)

    def _fulltext_search(self, query, term):
        """Run a full-text *query* template for *term* and return the rows as dicts.

        Returns None when the database reports an error.
        """
        cursor = None
        try:
            cursor = self.db_con.cursor(buffered=True)
            # The templates quote their placeholder ('{}'); swap it for a bound
            # parameter so the connector escapes the search term itself.
            cursor.execute(query.replace("'{}'", "%s"), (term,))
            return [
                {'nbr': nbr_, 'from': author_name_, 'to': to_, 'subject': subject_, 'date': date_, 'url': url_}
                for (nbr_, author_name_, to_, subject_, date_, url_) in cursor
            ]
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
            return None
        finally:
            if cursor is not None:
                cursor.close()

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def create_db(self, config=None):
        """Create the `full_digest_rescheduled` table.

        Connects from *config* first when no connection is open yet.
        Database errors are logged, not raised.
        """
        logging.info("creating table 'full_digest_rescheduled'")
        if not self._ensure_connection(config):
            return
        cursor = None
        try:
            cursor = self.db_con.cursor()
            cursor.execute(db.sql.CREATE)
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
        finally:
            # cursor() itself may have failed, leaving nothing to close.
            if cursor is not None:
                cursor.close()
        logging.info(" - done.")

    def insert_db(self, xml_file, urls_index_file, config=None):
        """Insert every <mail> of *xml_file* into the table.

        xml_file: XML document whose mails live under 'mails/mail'; its base
            name (up to the first dot) prefixes every mail number.
        urls_index_file: JSON file mapping '<prefix>.<nbr>' to the mail's URL.
        config: optional connection settings used when not connected yet.

        Mails without a known URL or a parseable date are skipped, duplicates
        (error 1062) are ignored, and the transaction is committed once after
        the last mail. Always returns None.
        """
        if not self._ensure_connection(config):
            return None
        if not os.path.isfile(xml_file):
            logging.error("%s is not a valid file.", xml_file)
            return None
        if not os.path.isfile(urls_index_file):
            logging.error("%s is not a valid file.", urls_index_file)
            return None

        with open(urls_index_file) as fp:
            urls = json.load(fp)
        ch = os.path.basename(xml_file).split('.')[0]
        root = et.parse(xml_file).getroot()

        logging.info("-----------------")
        logging.info(os.path.basename(xml_file))
        logging.info("-----------------")

        cursor = self.db_con.cursor()
        try:
            for m in root.findall('mails/mail'):
                nbr = self._child_text(m, 'nbr')
                to_str = self._child_text(m, 'to')
                date_str = self._child_text(m, 'date')
                from_str = self._child_text(m, 'from')
                subject_str = self._child_text(m, 'subject')
                content_str = self._child_text(m, 'content')

                if nbr is None:
                    logging.warning("mail without nbr... skipping.")
                    continue

                # format nbr
                nbr_str = ch + '.' + nbr
                if nbr_str not in urls:
                    logging.warning(nbr_str + " is not in urls... skipping.")
                    continue
                url = urls[nbr_str]

                date = db.utils.format_date(date_str) if date_str else None
                if date is None:
                    logging.warning("null date: %s - %s - %s", nbr_str, date_str, from_str)
                    logging.warning("continuing...")
                    continue

                # Placeholder recipient used by the source data.
                if to_str == "n/a":
                    to_str = "syndicate@aec.at"

                # (nbr_, author_name_, to_, subject_, date_, content_, url_)
                row = (nbr_str, from_str, to_str, subject_str, date, content_str, url)
                try:
                    logging.info("inserting " + nbr_str)
                    cursor.execute(db.sql.INSERT, row)
                    # execute() does not return the affected-row count; read it
                    # from the cursor instead.
                    if cursor.rowcount == 0:
                        logging.warning("error no insert...")
                        logging.warning(self._describe_row(row))
                except mariadb.Error as error:
                    if error.errno == 1062:
                        logging.info("+++db+++ duplicate")
                    else:
                        logging.warning("Error...")
                        logging.warning(error)
                        logging.warning(self._describe_row(row))
            self.db_con.commit()
        finally:
            cursor.close()
        return None

    def content_search(self, term, bool=True):
        """Full-text search of subjects and contents for *term*.

        bool: use BOOLEAN MODE matching when true, the default mode otherwise
            (the name shadows the builtin but is kept for existing callers).

        Returns a list of dicts with the keys 'nbr', 'from', 'to', 'subject',
        'date' and 'url' ordered by date, or None without a connection or on
        a database error.
        """
        if self.db_con is None:
            logging.warning("Not connection to database...")
            return None
        query = db.sql.CONTENT_QUERY_BOOLEAN if bool else db.sql.CONTENT_QUERY_NL
        return self._fulltext_search(query, term)

    def from_search(self, term, bool=True):
        """Full-text search of author names for *term*.

        Same parameters and return shape as content_search(); the underlying
        queries select the same six columns.
        """
        if self.db_con is None:
            logging.warning("Not connection to database...")
            return None
        query = db.sql.FROM_QUERY_BOOLEAN if bool else db.sql.FROM_QUERY_NL
        return self._fulltext_search(query, term)
+26
View File
@@ -0,0 +1,26 @@
import mysql.connector as mariadb
import dateparser, logging
import db.sql, db.utils
def query_url(db_con, date_str, from_str, table_str):
    """Look up the URLs of the mails sent by an author at a given date.

    db_con: open database connection.
    date_str: raw date header, normalised with db.utils.format_date().
    from_str: raw sender header, normalised with db.utils.format_author().
    table_str: name of the table to query.

    Returns the list of matching `url_` values (possibly empty), or None
    when the query fails.
    """
    d = db.utils.format_date(date_str)
    auth = db.utils.format_author(from_str)
    if d is None:
        # An unparseable date cannot match any row.
        return []

    # Identifiers cannot be bound, so only the table name is interpolated;
    # the author and date values are passed as parameters and escaped by the
    # connector.
    query = db.sql.URL_QUERY.replace('"{}"', '%s').format(table_str)

    cursor = None
    try:
        cursor = db_con.cursor(buffered=True)
        cursor.execute(query, (auth, d))
        return [u[0] for u in cursor]
    except mariadb.Error as error:
        logging.error("Mariadb error - query_url")
        logging.error(error)
        return None
    except Exception as e:
        logging.error("Error - query_url")
        logging.error(e)
        return None
    finally:
        # cursor() itself may have failed, leaving nothing to close.
        if cursor is not None:
            cursor.close()
+34
View File
@@ -0,0 +1,34 @@
# Column order expected by INSERT.
_INSERT_COLUMNS = ("nbr_", "author_name_", "to_", "subject_", "date_", "content_", "url_")

# Shared head and tail of the full-text SELECT templates.
_SELECT_MAILS = "SELECT nbr_, author_name_, to_, subject_, date_, url_ from full_digest_rescheduled "
_ORDER_BY_DATE = " ORDER BY date_"

# DDL of the mail table, with full-text indexes on subject/content and author.
CREATE = "".join((
    "CREATE TABLE `full_digest_rescheduled` (",
    "`nbr_` varchar(20) NOT NULL,",
    "`author_name_` varchar(200) NOT NULL,",
    "`to_` varchar(60) NOT NULL,",
    "`subject_` varchar(500) NOT NULL,",
    "`date_` datetime NOT NULL,",
    "`content_` mediumtext NOT NULL,",
    "`url_` varchar(100) NOT NULL,",
    "PRIMARY KEY(`nbr_`),",
    "FULLTEXT (`subject_`, `content_`),",
    "FULLTEXT (`author_name_`)",
    ") ENGINE = InnoDB;",
))

# Parameterised insert: one %s placeholder per column of _INSERT_COLUMNS.
INSERT = "INSERT INTO full_digest_rescheduled({}) VALUES ({})".format(
    ", ".join(_INSERT_COLUMNS),
    ", ".join(["%s"] * len(_INSERT_COLUMNS)),
)

# Full-text templates; '{}' receives the search expression.
CONTENT_QUERY_BOOLEAN = (
    _SELECT_MAILS
    + "WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE)"
    + _ORDER_BY_DATE
)
CONTENT_QUERY_NL = (
    _SELECT_MAILS
    + "WHERE MATCH(subject_, content_) AGAINST('{}')"
    + _ORDER_BY_DATE
)
FROM_QUERY_BOOLEAN = (
    _SELECT_MAILS
    + "WHERE MATCH(author_name_) AGAINST('{}' IN BOOLEAN MODE)"
    + _ORDER_BY_DATE
)
FROM_QUERY_NL = (
    _SELECT_MAILS
    + "WHERE MATCH(author_name_) AGAINST('{}')"
    + _ORDER_BY_DATE
)

# Placeholders, in order: table name, author name, date.
URL_QUERY = 'SELECT url_ FROM {} WHERE author_name_="{}" AND date_="{}"'

SHOW_TABLE = "show tables"
+125
View File
@@ -0,0 +1,125 @@
import email.utils
import logging

import dateparser
import mysql.connector as mariadb

import db.sql
def connect_db(database, host, user, password):
    """Open a connection to *database* on *host*.

    Returns the connection, or None when the connector reports an error
    (the error is logged; error 1049 gets an extra "does not exist" line).
    """
    try:
        # Return from the try body: a `return` inside `finally` would both
        # discard the None below and hit an unbound name on failure.
        return mariadb.connect(host=host, user=user, password=password, database=database)
    except mariadb.Error as error:
        logging.error("Error: {}".format(error))
        if error.errno == 1049:
            logging.error("Database " + database + " does not exist.")
        return None
def list_all_tables(db_con):
    """Return the first column of every row of `show tables`.

    db_con: open database connection.

    Returns a list of table names, or None when the database reports an
    error (the error is logged).
    """
    cursor = None
    try:
        cursor = db_con.cursor()
        cursor.execute(db.sql.SHOW_TABLE)
        return [t[0] for t in cursor]
    except mariadb.Error as error:
        logging.error("Error: {}".format(error))
        return None
    finally:
        # cursor() itself may have failed, leaving nothing to close.
        if cursor is not None:
            cursor.close()
def format_date(date_str):
    """Parse a mail date header, repairing known malformations along the way.

    The string is handed to dateparser.parse() as is; on failure a series of
    increasingly aggressive clean-ups is applied, re-parsing after each
    stage.

    date_str: raw date string; None or blank input yields None.

    Returns whatever dateparser.parse() produced for the first variant it
    accepted, or None when every attempt failed.
    """
    # Nothing to parse; also keeps the token indexing below safe.
    if not date_str or not date_str.strip():
        return None

    date_time = dateparser.parse(date_str)
    if date_time is not None:
        return date_time

    # Stage 1: drop everything from the first '(' onwards.
    if '(' in date_str:
        date_str = date_str.split('(')[0].rstrip()
        date_time = dateparser.parse(date_str)
        if date_time is not None:
            return date_time

    toks = date_str.split()
    if not toks:
        return None
    last = toks[-1]

    # Stage 2: repair the trailing offset and misspelt day names.
    if len(last) == 5 or len(last) == 4:
        # ex. Thu, 24 Jan 2002 15:21:31 -0000
        if last in ['+0000', '-0000', '0000']:
            date_str = date_str[:-5]
        # ex. Fri, 25 Jan 2002 13:21:49 +1050
        elif last[-2] == '5':
            chars = list(date_str)
            chars[-2] = '3'
            date_str = "".join(chars)
        if last[-1] != '0':
            # ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
            date_str = date_str[:-5]

    if 'Fru' in toks[0]:
        date_str = date_str.replace('Fru', 'Fri')
    elif 'Thur' in toks[0]:
        date_str = date_str.replace('Thur', 'Thu')

    date_time = dateparser.parse(date_str)
    if date_time is not None:
        return date_time

    # Stage 3: zone names.
    if 'GMT' in date_str:
        # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
        date_str = date_str.split('GMT')[0].rstrip()
    if 'METDST' in last:
        # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
        date_str = date_str.replace('METDST', 'MET')

    return dateparser.parse(date_str)
def format_author(author_str):
    """Reduce a raw sender header to the author's display name.

    Handles, in order: "by way of" forwarding notes, "address (Name)" /
    "Name (remark)" / "Name <address>" forms, and "name , address" forms.

    author_str: raw sender string; None or empty input is returned as is.

    Returns the cleaned author string.
    """
    if not author_str:
        return author_str

    if "by way of" in author_str:
        head, _, tail = author_str.partition("by way of")
        if head == "":
            # Nothing precedes the note: the forwarder named after it is the
            # only author information this string carries.
            author_str = tail.strip()
        elif head[-1] == "(":
            # ex. Name (by way of Someone) -> drop the opening parenthesis
            author_str = head[:-1].strip()
        else:
            author_str = head

    if ("(" in author_str) or ("<" in author_str):
        # ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
        normalised = author_str.lower().replace('{at}', '@').replace(' at ', '@')
        if '@' not in normalised:
            # No address present: the name is whatever precedes the remark.
            author_str = author_str.split('(')[0].strip()
        else:
            author_str = email.utils.parseaddr(author_str)[0]

    if " ," in author_str:
        # nettime's_roving_reporter , thing.net {AT} bbs.thing.net
        author_str = author_str.split(' ,')[0]

    return author_str