MEGA -- DB
This commit is contained in:
@@ -0,0 +1,257 @@
|
||||
import email, email.parser
|
||||
import os, json, gzip, re
|
||||
import mysql.connector as mariadb
|
||||
import archive.sql, archive.util
|
||||
from datetime import date, datetime
|
||||
from dateutil import parser
|
||||
import terminal.progress
|
||||
|
||||
def load_from_file(filename, archive_name, archive_dir):
|
||||
|
||||
if not filename.endswith('.json.gz'):
|
||||
file_path = os.path.join(archive_dir, filename + '.json.gz')
|
||||
else:
|
||||
file_path = os.path.join(archive_dir, filename)
|
||||
|
||||
if os.path.isfile(file_path):
|
||||
with gzip.open(file_path, 'r') as fp:
|
||||
json_data = json.load(fp)
|
||||
return (json_data, archive_name)
|
||||
else:
|
||||
#list of all "filename[...].json.gz" in archive_dir
|
||||
files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
|
||||
if files:
|
||||
filename = files[-1] # take the most recent (listed alpha-chronological)
|
||||
file_path = os.path.join(archive_dir, filename)
|
||||
if os.path.isfile(file_path):
|
||||
with gzip.open(file_path, 'r') as fp:
|
||||
json_data = json.load(fp)
|
||||
return (json_data, archive_name) # <--- this makes no sense....
|
||||
|
||||
else:
|
||||
#list of all json files in archive_dir/filename
|
||||
dir_path = os.path.join(archive_dir, filename)
|
||||
if not os.path.isdir(dir_path):
|
||||
return None
|
||||
|
||||
files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
|
||||
if not files:
|
||||
return None
|
||||
|
||||
# load all json files
|
||||
threads = []
|
||||
for file_path in files:
|
||||
with open(file_path, 'r') as fp:
|
||||
json_data = json.load(fp)
|
||||
threads.append(json_data)
|
||||
|
||||
return (threads, archive_name)
|
||||
|
||||
def connect_db(database, host, user, password):
|
||||
|
||||
try:
|
||||
con = mariadb.connect(host=host, user=user, password=password, database=database)
|
||||
except mariadb.Error as error:
|
||||
print("Error: {}".format(error))
|
||||
if error.errno == 1049:
|
||||
if util.y_n_question("Table " + archive_name + " does not exist. Create it?"):
|
||||
print("creating")
|
||||
else:
|
||||
print("not creating")
|
||||
return None
|
||||
finally:
|
||||
return con
|
||||
|
||||
|
||||
class Archive:
|
||||
|
||||
data = None # "raw" json data
|
||||
db_con = None
|
||||
|
||||
def __init__(self, archive_name, archive_dir):
|
||||
|
||||
if isinstance(archive_name, str):
|
||||
# need a filename or a dir name....
|
||||
print("reading archive " + archive_name, end='')
|
||||
(self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
|
||||
print(" - done.")
|
||||
|
||||
def __init__(self, archive_name, database, host, user, password):
|
||||
|
||||
self.archive_name = archive_name
|
||||
self.db_con = connect_db(database, host, user, password)
|
||||
|
||||
def __init__(self, archive_name, config):
|
||||
|
||||
self.archive_name = archive_name
|
||||
self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
if self.db_con is not None:
|
||||
self.db_con.close()
|
||||
|
||||
|
||||
def create_db(self, host, database, user, password):
|
||||
|
||||
print("creating table: " + self.archive_name, end='')
|
||||
self.db_con = connect_db(database, host, user, password)
|
||||
if self.db_con is None:
|
||||
return
|
||||
|
||||
try:
|
||||
cursor = self.db_con.cursor()
|
||||
cursor.execute(archive.sql.CREATE.format(self.archive_name))
|
||||
except mariadb.Error as error:
|
||||
print("Error: {}".format(error))
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
print(" - done.")
|
||||
|
||||
def insert_db(self, host, database, user, password):
|
||||
|
||||
self.db_con = connect_db(database, host, user, password)
|
||||
|
||||
if self.db_con is None:
|
||||
return
|
||||
|
||||
try:
|
||||
cursor = self.db_con.cursor()
|
||||
|
||||
progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)
|
||||
|
||||
for t in self.data:
|
||||
|
||||
n_inserted = self.recursive_insert_db(cursor, t["threads"])
|
||||
# print(" - insert: " + str(n_inserted), end='')
|
||||
if n_inserted > 0:
|
||||
self.db_con.commit()
|
||||
|
||||
progress.current += 1
|
||||
progress()
|
||||
|
||||
progress.done()
|
||||
self.db_con.commit()
|
||||
|
||||
except mariadb.Error as error:
|
||||
pass
|
||||
# print("Error: {}".format(error))
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
def recursive_insert_db(self, cursor, thread):
|
||||
|
||||
n_inserted = 0
|
||||
for m in thread:
|
||||
try:
|
||||
|
||||
from_ = archive.util.format_from(m)
|
||||
author_name_ = archive.util.format_author(m)
|
||||
to_ = archive.util.format_to(m)
|
||||
date_ = archive.util.format_date(m, self.archive_name)
|
||||
|
||||
if date_ is None or from_ is None:
|
||||
# print("\nerrorororororo")
|
||||
# print(m['from'] + " -- " + m['date'])
|
||||
continue
|
||||
|
||||
cursor.execute(archive.sql.INSERT, (from_,author_name_,to_,m["subject"],date_,m["content-type"],m["content"],m["url"]))
|
||||
n_inserted += 1
|
||||
|
||||
if "follow-up" in m:
|
||||
n_inserted += self.recursive_insert_db(cursor, m["follow-up"])
|
||||
|
||||
except mariadb.Error as error:
|
||||
if error.errno == 1062:
|
||||
#duplication continue <------------------------- look this up...
|
||||
# print("\nError: {}".format(error))
|
||||
continue
|
||||
|
||||
return n_inserted
|
||||
|
||||
def content_search(self, term, bool=True):
|
||||
|
||||
if self.db_con is None:
|
||||
print("Not connection to database...")
|
||||
return
|
||||
|
||||
try:
|
||||
cursor = self.db_con.cursor(buffered=True)
|
||||
if bool:
|
||||
cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
|
||||
else:
|
||||
cursor.execute(archive.sql.CONTENT_QUERY.format(self.archive_name, term))
|
||||
|
||||
# print(cursor.rowcount)
|
||||
results = []
|
||||
for (from_, author_name_, subject_, date_, url_) in cursor:
|
||||
results.append((from_, author_name_, subject_, date_, url_))
|
||||
# print("{} {} {}".format(from_, str(date_), url_))
|
||||
return results
|
||||
|
||||
except mariadb.Error as error:
|
||||
print("Error: {}".format(error))
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
def from_search(self, term, bool=True):
|
||||
|
||||
if self.db_con is None:
|
||||
print("Not connection to database...")
|
||||
return
|
||||
|
||||
try:
|
||||
cursor = self.db_con.cursor(buffered=True)
|
||||
if bool:
|
||||
cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
|
||||
else:
|
||||
cursor.execute(archive.sql.FROM_QUERY.format(self.archive_name, term))
|
||||
|
||||
# print(cursor.rowcount)
|
||||
results = []
|
||||
for (from_, author_name_, subject_, date_, url_) in cursor:
|
||||
results.append((from_, author_name_, subject_, date_, url_))
|
||||
# print("{} {} {}".format(from_, str(date_), url_))
|
||||
return results
|
||||
|
||||
except mariadb.Error as error:
|
||||
print("Error: {}".format(error))
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
# analysis
|
||||
def longest_field(self, field, thread, max_length=0):
|
||||
import archive.util
|
||||
for m in thread:
|
||||
if not field in m:
|
||||
if "threads" in m:
|
||||
max_length = self.longest_field(field, m["threads"], max_length)
|
||||
continue
|
||||
if m[field] is None:
|
||||
continue
|
||||
if field == "from":
|
||||
m[field] = archive.util.format_from(m)
|
||||
elif field == "author_name":
|
||||
m[field] = archive.util.format_author(m)
|
||||
elif field == "to":
|
||||
m[field] = archive.util.format_to(m)
|
||||
elif field == "date":
|
||||
m[field] = str(archive.util.format_date(m, self.archive_name))
|
||||
|
||||
|
||||
if m[field] is None:
|
||||
continue
|
||||
|
||||
l = len(m[field])
|
||||
if l > max_length:
|
||||
max_length = l
|
||||
print(">> " + m[field])
|
||||
if "follow-up" in m:
|
||||
max_length = self.longest_field(field, m["follow-up"], max_length)
|
||||
return max_length
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
CREATE = "CREATE TABLE `{}` (" \
|
||||
"`from_` varchar(85) NOT NULL," \
|
||||
"`author_name_` varchar(200) NOT NULL," \
|
||||
"`to_` text(60)," \
|
||||
"`subject_` varchar(3500) NOT NULL," \
|
||||
"`date_` datetime NOT NULL," \
|
||||
"`content_type_` varchar(15) NOT NULL," \
|
||||
"`content_` mediumtext NOT NULL," \
|
||||
"`url_` varchar(100) NOT NULL," \
|
||||
"PRIMARY KEY(`from_`, `date_`)," \
|
||||
"FULLTEXT (`subject_`, `content_`)," \
|
||||
"FULLTEXT (`from_`, `author_name_`)" \
|
||||
") ENGINE = InnoDB;"
|
||||
|
||||
INSERT = ("INSERT INTO nettime_l"
|
||||
"(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
|
||||
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
|
||||
|
||||
CONTENT_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
|
||||
"WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
|
||||
|
||||
CONTENT_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
|
||||
"WHERE MATCH(subject_, content_) AGAINST('{}') ORDER BY date_")
|
||||
|
||||
FROM_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
|
||||
"WHERE MATCH(from_, author_name_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
|
||||
|
||||
FROM_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
|
||||
"WHERE MATCH(from_, author_name_) AGAINST('{}') ORDER BY date_")
|
||||
|
||||
# SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l WHERE MATCH(content_) AGAINST('%s' IN BOOLEAN MODE)
|
||||
Executable
+225
@@ -0,0 +1,225 @@
|
||||
import email, datetime, sys
|
||||
import hashlib
|
||||
import dateparser
|
||||
|
||||
def format_content(msg):
|
||||
return msg['content']
|
||||
|
||||
def format_url(msg):
|
||||
return msg['url']
|
||||
|
||||
def format_author(msg):
|
||||
|
||||
if 'author_name' not in msg or msg['author_name'] is None:
|
||||
return None
|
||||
|
||||
author_str = msg['author_name'].replace('"', '')
|
||||
|
||||
if "by way of" in author_str:
|
||||
toks = author_str.split("by way of")
|
||||
if toks[0] == "":
|
||||
author_str = format_from(msg)
|
||||
elif toks[0][-1] == "(":
|
||||
author_str = toks[0][:-1].strip()
|
||||
else:
|
||||
author_str = toks[0]
|
||||
|
||||
if ("(" in author_str) or ("<" in author_str):
|
||||
# ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
|
||||
# print("±±±±±±")
|
||||
# print("name: " + author_str)
|
||||
# print("from: " + msg['from'])
|
||||
if not '@' in author_str.lower().replace('{at}', '@').replace(' at ', '@'):
|
||||
author_str = author_str.split('(')[0].strip()
|
||||
else:
|
||||
author_str = email.utils.parseaddr(author_str)[0]
|
||||
# print(" Name:" + author_str.replace('"', ''))
|
||||
# print(" From:" + format_from(msg))
|
||||
|
||||
if " ," in author_str:
|
||||
# nettime's_roving_reporter , thing.net {AT} bbs.thing.net
|
||||
author_str = author_str.split(' ,')[0]
|
||||
|
||||
|
||||
return author_str
|
||||
|
||||
def format_from_token(from_str, sep):
|
||||
from_addr = email.utils.parseaddr(from_str)[1]
|
||||
if sep not in from_addr:
|
||||
tok = from_str.split()
|
||||
try:
|
||||
at = tok.index(sep)
|
||||
from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
|
||||
if from_addr.startswith('<') or from_addr.endswith('>'):
|
||||
from_addr = from_addr.strip('<').strip('>')
|
||||
except ValueError:
|
||||
print(tok)
|
||||
print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
|
||||
return None
|
||||
else:
|
||||
from_addr = from_addr.replace(sep, '{AT}')
|
||||
return "".join(from_addr.lower().split())
|
||||
|
||||
def format_from(msg):
|
||||
|
||||
if 'from' not in msg or msg['from'] is None:
|
||||
return None
|
||||
|
||||
from_str = msg['from']
|
||||
|
||||
if " {AT} " in from_str:
|
||||
return format_from_token(from_str, '{AT}')
|
||||
elif " at " in from_str:
|
||||
return format_from_token(from_str, 'at')
|
||||
elif "@" in from_str:
|
||||
return format_from_token(from_str, '@')
|
||||
else:
|
||||
return "".join(from_str.split())
|
||||
|
||||
def format_to(msg):
|
||||
|
||||
if "to" not in msg or msg["to"] is None:
|
||||
return None
|
||||
|
||||
to_str = msg["to"]
|
||||
toks = email.utils.parseaddr(to_str)
|
||||
# print(toks)
|
||||
|
||||
if len(toks) == 2:
|
||||
to_str = toks[1]
|
||||
|
||||
return "".join(to_str.lower().split())
|
||||
|
||||
|
||||
# returns utc timestamp --- old...
|
||||
def format_date_utc(msg, archive_name):
|
||||
|
||||
if 'date' not in msg or msg['date'] is None:
|
||||
return None
|
||||
|
||||
date_str = msg['date'].replace('.', '')
|
||||
time_tz = None
|
||||
try:
|
||||
date_tz = email.utils.parsedate_tz(date_str)
|
||||
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
|
||||
except TypeError:
|
||||
print("Format Date TypeError")
|
||||
print(" > " + date_str)
|
||||
return None
|
||||
except ValueError:
|
||||
print("Format Date ValueError")
|
||||
print(" > " + date_str)
|
||||
return None
|
||||
finally:
|
||||
return time_tz
|
||||
|
||||
def format_date(msg, archive_name):
|
||||
|
||||
if 'date' not in msg or msg['date'] is None:
|
||||
return None
|
||||
|
||||
# date_str = msg['date'].replace('.', '')
|
||||
date_str = msg['date']
|
||||
|
||||
# fix Thu, 01 Aug 2002 17:33:08 +0900 (JST)
|
||||
if '(' in date_str:
|
||||
date_str = date_str.split('(')[0].rstrip()
|
||||
|
||||
|
||||
date_time = dateparser.parse(date_str)
|
||||
if date_time is None:
|
||||
|
||||
# random stuff...
|
||||
fix = False
|
||||
toks = date_str.split()
|
||||
|
||||
if len(toks[-1]) == 5 or len(toks[-1]) == 4:
|
||||
# ex. Thu, 24 Jan 2002 15:21:31 -0000
|
||||
if toks[-1] in ['+0000', '-0000', '0000']:
|
||||
date_str = date_str[:-5]
|
||||
fix = True
|
||||
# ex. Fri, 25 Jan 2002 13:21:49 +1050
|
||||
elif toks[-1][-2] == '5':
|
||||
d = list(date_str)
|
||||
d[-2] = '3'
|
||||
date_str = "".join(d)
|
||||
fix = True
|
||||
|
||||
if toks[-1][-1] != '0':
|
||||
#ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
|
||||
date_str = date_str[:-5]
|
||||
fix = True
|
||||
|
||||
if 'Fru' in toks[0]:
|
||||
date_str = date_str.replace('Fru', 'Fri')
|
||||
fix = True
|
||||
elif 'Thur' in toks[0]:
|
||||
date_str = date_str.replace('Thur', 'Thu')
|
||||
fix = True
|
||||
|
||||
if not fix:
|
||||
# print("----")
|
||||
return None
|
||||
|
||||
date_time = dateparser.parse(date_str)
|
||||
if date_time is None:
|
||||
|
||||
if 'GMT' in date_str:
|
||||
# ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
|
||||
date_str = date_str.split('GMT')[0].rstrip()
|
||||
fix = True
|
||||
|
||||
if 'METDST' in toks[-1]:
|
||||
# ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
|
||||
date_str = date_str.replace('METDST', 'MET')
|
||||
fix = True
|
||||
|
||||
|
||||
if not fix:
|
||||
# print("++++")
|
||||
return None
|
||||
|
||||
date_time = dateparser.parse(date_str)
|
||||
return date_time
|
||||
|
||||
# else:
|
||||
# print(date_str)
|
||||
|
||||
# date_time = datetime.datetime.fromtimestamp(time_tz)
|
||||
|
||||
min_d = datetime.datetime.strptime(min_date(archive_name), "%d/%m/%Y")
|
||||
max_d = datetime.datetime.now()
|
||||
|
||||
date_time_naive = date_time.replace(tzinfo=None)
|
||||
|
||||
if date_time_naive < min_d or date_time_naive > max_d:
|
||||
return None
|
||||
|
||||
return date_time
|
||||
|
||||
def format_subject(msg, archive_name):
|
||||
|
||||
if 'subject' not in msg or msg['subject'] is None:
|
||||
return None
|
||||
|
||||
return msg['subject']
|
||||
|
||||
def format_id(msg, archive_name):
|
||||
if "message-id" in msg:
|
||||
return msg['message-id']
|
||||
else:
|
||||
# create hash with author_name + date
|
||||
s = msg['author_name'] + msg['date']
|
||||
sha = hashlib.sha1(s.encode('utf-8'))
|
||||
return sha.hexdigest()
|
||||
|
||||
# format='%d/%m/%Y'
|
||||
def min_date(archive_name):
|
||||
if "nettime" in archive_name:
|
||||
return '01/10/1995'
|
||||
elif archive_name == "spectre":
|
||||
return '01/08/2001'
|
||||
elif archive_name == "empyre":
|
||||
return '01/01/2002'
|
||||
elif archive_name == "crumb":
|
||||
return '01/02/2001'
|
||||
Reference in New Issue
Block a user