index + db
This commit is contained in:
@@ -0,0 +1,176 @@
|
||||
import mysql.connector as mariadb
|
||||
import os, json, glob, logging
|
||||
from lxml import etree as et
|
||||
import db.sql, db.utils
|
||||
|
||||
class DB:
    """Access layer for the ``full_digest_rescheduled`` MariaDB table.

    Holds a single connection (``db_con``) obtained through
    ``db.utils.connect_db`` and offers table creation, bulk insertion from an
    XML archive file and full-text searches. The class is usable as a context
    manager: leaving the ``with`` block closes the connection.
    """

    # Class-level default so the ``is None`` checks below also work on an
    # instance whose __init__ did not run.
    db_con = None

    def __init__(self, config):
        """Connect using *config*.

        *config* is indexed with the keys 'database', 'host', 'user' and
        'password'.
        """
        self.db_con = db.utils.connect_db(
            config['database'], config['host'], config['user'], config['password'])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Exceptions are not suppressed (implicit None return).
        if self.db_con is not None:
            self.db_con.close()

    def _ensure_connection(self, config=None):
        """Return True if a connection is available, connecting from *config* if needed."""
        if self.db_con is None and config is not None:
            self.db_con = db.utils.connect_db(
                config['database'], config['host'], config['user'], config['password'])

        if self.db_con is None:
            logging.warning(" - no connection... Aborting.")
            return False
        return True

    @staticmethod
    def _mail_text(mail, tag):
        """Return the text of child *tag* of *mail*, or None when the child is missing."""
        node = mail.find(tag)
        return None if node is None else node.text

    def create_db(self, config=None):
        """Create the ``full_digest_rescheduled`` table.

        If no connection is open yet and *config* is given, a connection is
        attempted first. Database errors are logged, not raised.
        """
        logging.info("creating table 'full_digest_rescheduled'")
        if not self._ensure_connection(config):
            return

        cursor = None
        try:
            cursor = self.db_con.cursor()
            cursor.execute(db.sql.CREATE)
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
        else:
            # Only report success when the statement actually went through.
            logging.info(" - done.")
        finally:
            # cursor stays None when cursor() itself failed.
            if cursor is not None:
                cursor.close()

    def insert_db(self, xml_file, urls_index_file, config=None):
        """Insert every ``mails/mail`` element of *xml_file* into the table.

        *urls_index_file* is a JSON file mapping ``"<archive>.<nbr>"`` keys to
        urls, where ``<archive>`` is the XML file's base name up to its first
        dot. Mails without an index entry or without a parseable date are
        skipped with a warning. Duplicate-key errors (errno 1062) and other
        database errors on a single mail are logged and the loop continues.
        All successful inserts are committed once, after the loop.

        Returns None in every case.
        """
        if not self._ensure_connection(config):
            return None

        if not os.path.isfile(xml_file):
            logging.error("%s is not a valid file.", xml_file)
            return None

        if not os.path.isfile(urls_index_file):
            logging.error("%s is not a valid file.", urls_index_file)
            return None

        with open(urls_index_file) as fp:
            urls = json.load(fp)

        ch = os.path.basename(xml_file).split('.')[0]
        root = et.parse(xml_file).getroot()

        logging.info("-----------------")
        logging.info(os.path.basename(xml_file))
        logging.info("-----------------")

        cursor = self.db_con.cursor()
        try:
            for mail in root.findall('mails/mail'):
                self._insert_mail(cursor, mail, ch, urls)
            self.db_con.commit()
        finally:
            cursor.close()
        return None

    def _insert_mail(self, cursor, mail, ch, urls):
        """Insert one ``mail`` element through *cursor*; log and skip on any problem."""
        nbr_str = self._mail_text(mail, 'nbr')
        to_str = self._mail_text(mail, 'to')
        date_str = self._mail_text(mail, 'date')
        from_str = self._mail_text(mail, 'from')
        subject_str = self._mail_text(mail, 'subject')
        content_str = self._mail_text(mail, 'content')

        if nbr_str is None:
            logging.warning("mail without nbr... skipping.")
            return

        # format nbr: index keys are prefixed with the archive name
        nbr_str = ch + '.' + nbr_str
        if nbr_str not in urls:
            logging.warning(nbr_str + " is not in urls... skipping.")
            return
        url = urls[nbr_str]

        # An absent/empty date element cannot be parsed; treat it as a null date.
        date = db.utils.format_date(date_str) if date_str else None
        if date is None:
            logging.warning("null date: %s - %s - %s", nbr_str, date_str, from_str)
            logging.warning("continuing...")
            return

        # Placeholder recipient found in the archives is mapped to the list address.
        if to_str == "n/a":
            to_str = "syndicate@aec.at"

        # (nbr_, author_name_, to_, subject_, date_, content_, url_)
        row = (nbr_str, from_str, to_str, subject_str, date, content_str, url)
        try:
            logging.info("inserting " + nbr_str)
            cursor.execute(db.sql.INSERT, row)
            # execute() does not return a row count; the cursor carries it.
            if cursor.rowcount == 0:
                logging.warning("error no insert...")
                # str() each value: the date is a datetime and fields may be None.
                logging.warning(" - ".join(str(value) for value in row))
        except mariadb.Error as error:
            if error.errno == 1062:
                logging.info("+++db+++ duplicate")
            else:
                logging.warning("Error...")
                logging.warning(error)
                logging.warning(" - ".join(str(value) for value in row))

    def _fulltext_search(self, template, term):
        """Run one of the ``db.sql`` full-text templates with *term* bound as a parameter.

        The templates carry a quoted ``str.format`` slot (``'{}'``); it is
        swapped for a driver placeholder so the term is escaped by the
        connector instead of being spliced into the SQL text.

        Returns a list of dicts with the keys 'nbr', 'from', 'to', 'subject',
        'date' and 'url', or None after a database error (which is logged).
        """
        query = template.replace("'{}'", "%s")
        cursor = None
        try:
            cursor = self.db_con.cursor(buffered=True)
            cursor.execute(query, (term,))
            # nbr_, author_name_, to_, subject_, date_, url_
            return [
                {'nbr': nbr_, 'from': author_name_, 'to': to_,
                 'subject': subject_, 'date': date_, 'url': url_}
                for (nbr_, author_name_, to_, subject_, date_, url_) in cursor
            ]
        except mariadb.Error as error:
            logging.error("Error: {}".format(error))
            return None
        finally:
            if cursor is not None:
                cursor.close()

    def content_search(self, term, bool=True):
        """Full-text search of *term* over subject and content.

        *bool* selects the BOOLEAN MODE query when true and the natural
        language query otherwise (the name shadows the builtin but is kept so
        keyword callers keep working). Returns a list of result dicts, or None
        when there is no connection or the query fails.
        """
        if self.db_con is None:
            logging.warning("Not connection to database...")
            return None

        template = db.sql.CONTENT_QUERY_BOOLEAN if bool else db.sql.CONTENT_QUERY_NL
        return self._fulltext_search(template, term)

    def from_search(self, term, bool=True):
        """Full-text search of *term* over the author name.

        Same parameters and return shape as ``content_search``: the author
        queries select the same six columns, so rows are returned as the same
        dicts.
        """
        if self.db_con is None:
            logging.warning("Not connection to database...")
            return None

        template = db.sql.FROM_QUERY_BOOLEAN if bool else db.sql.FROM_QUERY_NL
        return self._fulltext_search(template, term)
|
||||
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
import mysql.connector as mariadb
|
||||
import dateparser, logging
|
||||
import db.sql, db.utils
|
||||
|
||||
|
||||
def query_url(db_con, date_str, from_str, table_str):
    """Look up the urls of messages by author and date in table *table_str*.

    *date_str* and *from_str* are normalised with ``db.utils.format_date`` and
    ``db.utils.format_author`` before being matched against ``date_`` and
    ``author_name_``.

    Returns a list with the ``url_`` value of every matching row, or None if
    the query fails (the error is logged).
    """
    d = db.utils.format_date(date_str)
    auth = db.utils.format_author(from_str)

    # Identifiers cannot be bound as parameters, so the table name is still
    # interpolated: callers must pass a trusted table name. The author and
    # date values are bound so the connector escapes them.
    query = "SELECT url_ FROM {} WHERE author_name_ = %s AND date_ = %s".format(table_str)

    cursor = None
    try:
        cursor = db_con.cursor(buffered=True)
        cursor.execute(query, (auth, d))
        return [row[0] for row in cursor]
    except mariadb.Error as error:
        logging.error("Mariadb error - query_url")
        logging.error(error)
        return None
    except Exception as e:
        logging.error("Error - query_url")
        logging.error(e)
        return None
    finally:
        # cursor stays None when cursor() itself failed; closing it
        # unconditionally would raise and mask the None return above.
        if cursor is not None:
            cursor.close()
|
||||
@@ -0,0 +1,34 @@
|
||||
# SQL statements for the `full_digest_rescheduled` table.

_TABLE = "full_digest_rescheduled"

# Columns returned by every full-text search, in result order.
_RESULT_COLUMNS = "nbr_, author_name_, to_, subject_, date_, url_"


def _fulltext_query(match_columns, boolean_mode):
    """Build a full-text SELECT; the search term goes in the remaining '{}' slot."""
    mode = " IN BOOLEAN MODE" if boolean_mode else ""
    return "SELECT %s from %s WHERE MATCH(%s) AGAINST('{}'%s) ORDER BY date_" % (
        _RESULT_COLUMNS, _TABLE, match_columns, mode)


# Table definition: one row per message, keyed by `nbr_`, with full-text
# indexes over subject+content and over the author name.
CREATE = "CREATE TABLE `full_digest_rescheduled` (" + ",".join([
    "`nbr_` varchar(20) NOT NULL",
    "`author_name_` varchar(200) NOT NULL",
    "`to_` varchar(60) NOT NULL",
    "`subject_` varchar(500) NOT NULL",
    "`date_` datetime NOT NULL",
    "`content_` mediumtext NOT NULL",
    "`url_` varchar(100) NOT NULL",
    "PRIMARY KEY(`nbr_`)",
    "FULLTEXT (`subject_`, `content_`)",
    "FULLTEXT (`author_name_`)",
]) + ") ENGINE = InnoDB;"

# Parameterised insert; values are bound by the connector (%s placeholders).
INSERT = "INSERT INTO {}({}) VALUES ({})".format(
    _TABLE,
    "nbr_, author_name_, to_, subject_, date_, content_, url_",
    ", ".join(["%s"] * 7),
)

# Full-text searches over subject and content.
CONTENT_QUERY_BOOLEAN = _fulltext_query("subject_, content_", True)
CONTENT_QUERY_NL = _fulltext_query("subject_, content_", False)

# Full-text searches over the author name.
FROM_QUERY_BOOLEAN = _fulltext_query("author_name_", True)
FROM_QUERY_NL = _fulltext_query("author_name_", False)

# Slots, in order: table name, author name, date.
URL_QUERY = " ".join([
    'SELECT url_ FROM {}',
    'WHERE author_name_="{}" AND date_="{}"',
])

SHOW_TABLE = "show tables"
|
||||
+125
@@ -0,0 +1,125 @@
|
||||
import email.utils
import logging

import dateparser
import mysql.connector as mariadb

import db.sql
|
||||
|
||||
def connect_db(database, host, user, password):
    """Open a connection to *database* on *host*.

    Returns the connection object, or None when the connector reports an
    error (the error is logged; errno 1049 gets an extra "does not exist"
    message). Exceptions that are not connector errors propagate.
    """
    try:
        return mariadb.connect(host=host, user=user, password=password, database=database)
    except mariadb.Error as error:
        logging.error("Error: {}".format(error))
        if error.errno == 1049:
            logging.error("Database " + database + " does not exist.")
        # No `finally: return con` here: `con` is unbound on failure, and a
        # return inside finally would also swallow unrelated exceptions.
        return None
|
||||
|
||||
def list_all_tables(db_con):
    """Return the first column of every row produced by ``db.sql.SHOW_TABLE``.

    Returns a list, or None when the connector reports an error (the error is
    logged).
    """
    cursor = None
    try:
        cursor = db_con.cursor()
        cursor.execute(db.sql.SHOW_TABLE)
        return [row[0] for row in cursor]
    except mariadb.Error as error:
        logging.error("Error: {}".format(error))
        return None
    finally:
        # cursor stays None when cursor() itself failed; closing it
        # unconditionally would raise and mask the None return above.
        if cursor is not None:
            cursor.close()
|
||||
|
||||
def format_date(date_str):
    """Parse a mail ``Date`` header into whatever ``dateparser.parse`` returns.

    The raw string is tried first. On failure a series of repairs for
    malformed headers is applied, re-parsing after each stage:

    1. drop a trailing parenthesised comment;
    2. repair the trailing UTC offset and misspelt weekday names;
    3. drop a ``GMT...`` suffix and rewrite ``METDST`` as ``MET``.

    Returns the parsed value, or None when *date_str* is not a non-blank
    string or nothing could be parsed.
    """
    if not isinstance(date_str, str) or not date_str.strip():
        return None

    date_time = dateparser.parse(date_str)
    if date_time is not None:
        return date_time

    # Stage 1: strip a trailing "(...)" comment.
    if '(' in date_str:
        date_str = date_str.split('(')[0].rstrip()

    date_time = dateparser.parse(date_str)
    if date_time is not None:
        return date_time

    toks = date_str.split()
    if not toks:
        # Nothing but a comment was present; nothing left to repair.
        return None
    last = toks[-1]

    # Stage 2: offset and weekday repairs.
    if len(last) == 5 or len(last) == 4:
        # ex. Thu, 24 Jan 2002 15:21:31 -0000
        if last in ['+0000', '-0000', '0000']:
            date_str = date_str[:-5]
        # ex. Fri, 25 Jan 2002 13:21:49 +1050
        elif last[-2] == '5':
            chars = list(date_str)
            chars[-2] = '3'
            date_str = "".join(chars)

        if last[-1] != '0':
            # ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
            date_str = date_str[:-5]

    if 'Fru' in toks[0]:
        date_str = date_str.replace('Fru', 'Fri')
    elif 'Thur' in toks[0]:
        date_str = date_str.replace('Thur', 'Thu')

    date_time = dateparser.parse(date_str)
    if date_time is not None:
        return date_time

    # Stage 3: timezone-name repairs.
    if 'GMT' in date_str:
        # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
        date_str = date_str.split('GMT')[0].rstrip()

    if 'METDST' in last:
        # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
        date_str = date_str.replace('METDST', 'MET')

    return dateparser.parse(date_str)
|
||||
|
||||
def format_author(author_str):
    """Reduce a raw ``From`` value to an author name.

    Steps, in order:

    1. Cut at "by way of": keep what precedes it (minus a dangling "(").
       When nothing precedes it, the text after the marker is the only name
       available and is used instead.
    2. If the value contains "(" or "<": when it holds an address ('@',
       '{at}' or ' at '), take the display name from
       ``email.utils.parseaddr``; otherwise keep the text before the first
       "(".
    3. Cut at " ," and keep what precedes it.

    Returns "" for None or an empty string.
    """
    if not author_str:
        return ""

    marker = "by way of"
    if marker in author_str:
        before, _, after = author_str.partition(marker)
        if before == "":
            # No name in front of the marker: fall back to what follows it.
            author_str = after.strip()
        elif before[-1] == "(":
            author_str = before[:-1].strip()
        else:
            author_str = before

    if ("(" in author_str) or ("<" in author_str):
        # ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
        normalized = author_str.lower().replace('{at}', '@').replace(' at ', '@')
        if '@' not in normalized:
            author_str = author_str.split('(')[0].strip()
        else:
            author_str = email.utils.parseaddr(author_str)[0]

    if " ," in author_str:
        # ex. nettime's_roving_reporter , thing.net {AT} bbs.thing.net
        author_str = author_str.split(' ,')[0]

    return author_str
|
||||
Reference in New Issue
Block a user