import email, email.parser
import os, json, gzip, re
import mysql.connector as mariadb
import archive.sql, archive.util
from datetime import date, datetime
import terminal.progress


def load_from_file(filename, archive_name, archive_dir):
    """Load archive data for *archive_name* from *archive_dir*.

    Tries, in order:
      1. ``<archive_dir>/<filename>.json.gz`` as a single gzipped JSON file;
      2. the alphabetically last ``<filename>*.json.gz`` in *archive_dir*
         (names sort alpha-chronologically, so the last is the most recent);
      3. ``<archive_dir>/<filename>/`` as a directory of plain ``*.json``
         files, all of which are loaded.

    Returns a ``(data, archive_name)`` tuple, or ``None`` when nothing
    matching is found.  NOTE(review): cases 1/2 return the parsed JSON
    object directly while case 3 returns a *list* of parsed objects —
    this asymmetry is preserved from the original code; confirm callers
    expect it.
    """
    if filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename)
    else:
        file_path = os.path.join(archive_dir, filename + '.json.gz')

    if os.path.isfile(file_path):
        with gzip.open(file_path, 'r') as fp:
            return (json.load(fp), archive_name)

    # Fall back to the most recent "filename*.json.gz" in archive_dir.
    candidates = sorted(f for f in os.listdir(archive_dir)
                        if os.path.isfile(os.path.join(archive_dir, f))
                        and f.startswith(filename)
                        and f.endswith('.json.gz'))
    if candidates:
        file_path = os.path.join(archive_dir, candidates[-1])
        if os.path.isfile(file_path):
            with gzip.open(file_path, 'r') as fp:
                return (json.load(fp), archive_name)
        # Snapshot vanished between listdir() and isfile(): nothing to load.
        return None

    # Finally, treat archive_dir/filename as a directory of *.json files.
    dir_path = os.path.join(archive_dir, filename)
    if not os.path.isdir(dir_path):
        return None
    files = [os.path.join(dir_path, f) for f in os.listdir(dir_path)
             if os.path.isfile(os.path.join(dir_path, f))
             and f.endswith('.json')]
    if not files:
        return None
    threads = []
    for file_path in files:
        with open(file_path, 'r') as fp:
            threads.append(json.load(fp))
    return (threads, archive_name)


def connect_db(database, host, user, password):
    """Open a MariaDB/MySQL connection; return it, or ``None`` on failure.

    On MySQL errno 1049 (ER_BAD_DB_ERROR, "unknown database") the user is
    asked whether the database should be created; actual creation is still
    a TODO — only a message is printed either way.

    Bug fixes vs. the original: the error path referenced the undefined
    names ``archive_name`` and ``util`` (NameError), and ``finally:
    return con`` could raise UnboundLocalError and swallowed exceptions.
    """
    try:
        con = mariadb.connect(host=host, user=user,
                              password=password, database=database)
    except mariadb.Error as error:
        print("Error: {}".format(error))
        if error.errno == 1049:  # ER_BAD_DB_ERROR: unknown database
            # NOTE(review): assumes archive.util provides y_n_question()
            # (the original called the undefined name ``util``) — confirm.
            if archive.util.y_n_question(
                    "Database " + database + " does not exist. Create it?"):
                print("creating")  # TODO: actually create the database
            else:
                print("not creating")
        return None
    return con


class Archive:
    """A mailing-list archive backed either by JSON files or a MariaDB table.

    The original code declared three overloaded ``__init__`` methods, which
    Python does not support — only the last definition (the config-dict
    form) survived.  They are merged here into a single constructor that
    dispatches on its arguments; the historical positional call
    ``Archive(name, config_dict)`` keeps working unchanged.
    """

    def __init__(self, archive_name, config=None, *, archive_dir=None,
                 database=None, host=None, user=None, password=None):
        """Create an Archive.

        archive_name -- name of the archive (also used as the table name).
        config       -- dict with 'database', 'host', 'user', 'password'
                        keys for a DB-backed archive (the historical
                        positional calling convention), or a directory
                        path string for a file-backed archive.
        archive_dir  -- directory to load gzipped/plain JSON data from.
        database, host, user, password -- explicit DB credentials, as an
                        alternative to *config*.
        """
        self.archive_name = archive_name
        self.data = None      # "raw" json data (file-backed archives only)
        self.db_con = None    # open DB connection (DB-backed archives only)
        if isinstance(config, dict):
            self.db_con = connect_db(config['database'], config['host'],
                                     config['user'], config['password'])
        elif database is not None:
            self.db_con = connect_db(database, host, user, password)
        elif archive_dir is not None or isinstance(config, str):
            directory = archive_dir if archive_dir is not None else config
            print("reading archive " + archive_name, end='')
            (self.data, self.archive_name) = load_from_file(
                archive_name, archive_name, directory)
            print(" - done.")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Close the DB connection when used as a context manager.
        if self.db_con is not None:
            self.db_con.close()

    def create_db(self, host, database, user, password):
        """Create the archive's table (named after self.archive_name)."""
        print("creating table: " + self.archive_name, end='')
        self.db_con = connect_db(database, host, user, password)
        if self.db_con is None:
            return
        # Cursor is created outside the try so the finally-close cannot
        # hit an unbound name (original bug).
        cursor = self.db_con.cursor()
        try:
            cursor.execute(archive.sql.CREATE.format(self.archive_name))
        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()
        print(" - done.")

    def insert_db(self, host, database, user, password):
        """Insert all loaded threads (self.data) into the archive's table.

        Commits after each top-level thread that produced at least one
        row, and once more at the end; shows a terminal progress bar.
        """
        self.db_con = connect_db(database, host, user, password)
        if self.db_con is None:
            return
        cursor = self.db_con.cursor()  # outside try: see create_db
        try:
            progress = terminal.progress.ProgressBar(
                self.archive_name, len(self.data),
                fmt=terminal.progress.ProgressBar.FULL)
            for t in self.data:
                n_inserted = self.recursive_insert_db(cursor, t["threads"])
                if n_inserted > 0:
                    self.db_con.commit()
                progress.current += 1
                progress()
            progress.done()
            self.db_con.commit()
        except mariadb.Error as error:
            # Best-effort bulk load: report instead of aborting, but do
            # not silently swallow the error as the original code did.
            print("Error: {}".format(error))
        finally:
            cursor.close()

    def recursive_insert_db(self, cursor, thread):
        """Insert every message of *thread* (and its follow-ups) via *cursor*.

        Messages with an unparsable date or sender are skipped, as are
        messages that trigger a DB error (notably errno 1062, duplicate
        key).  Returns the number of rows inserted.
        """
        n_inserted = 0
        for m in thread:
            try:
                from_ = archive.util.format_from(m)
                author_name_ = archive.util.format_author(m)
                to_ = archive.util.format_to(m)
                date_ = archive.util.format_date(m, self.archive_name)
                if date_ is None or from_ is None:
                    continue  # malformed header: skip this message
                cursor.execute(archive.sql.INSERT,
                               (from_, author_name_, to_, m["subject"],
                                date_, m["content-type"], m["content"],
                                m["url"]))
                n_inserted += 1
                if "follow-up" in m:
                    n_inserted += self.recursive_insert_db(cursor,
                                                           m["follow-up"])
            except mariadb.Error as error:
                # errno 1062 = ER_DUP_ENTRY (duplicate key).  The original
                # continued on 1062 AND on every other DB error, so one
                # bad row never aborts the whole import; preserved here.
                continue
        return n_inserted

    def content_search(self, term, bool=True):
        """Full-text search of message content; returns a list of
        (from, author_name, subject, date, url) tuples, or None on error.

        bool -- use the BOOLEAN MODE query variant when True.  (The name
        shadows the builtin but is kept for caller compatibility.)

        SECURITY NOTE(review): *term* is interpolated into the SQL text
        via str.format() — do not pass untrusted input until the queries
        in archive.sql are parameterized.
        """
        if self.db_con is None:
            print("No connection to database...")
            return
        cursor = self.db_con.cursor(buffered=True)
        try:
            if bool:
                cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(
                    self.archive_name, term))
            else:
                cursor.execute(archive.sql.CONTENT_QUERY.format(
                    self.archive_name, term))
            return [row for row in cursor]
        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

    def from_search(self, term, bool=True):
        """Search the "from" field; same contract and caveats as
        content_search (including the SQL-injection SECURITY NOTE there).
        """
        if self.db_con is None:
            print("No connection to database...")
            return
        cursor = self.db_con.cursor(buffered=True)
        try:
            if bool:
                cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(
                    self.archive_name, term))
            else:
                cursor.execute(archive.sql.FROM_QUERY.format(
                    self.archive_name, term))
            return [row for row in cursor]
        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

    # analysis
    def longest_field(self, field, thread, max_length=0):
        """Return the length of the longest value of *field* across
        *thread*, recursing into "threads" and "follow-up" children.

        from/author_name/to/date values are normalized through
        archive.util before measuring (mutating the message dicts in
        place, as the original did).  Each new maximum is printed — a
        debug aid preserved from the original.
        """
        import archive.util
        for m in thread:
            if field not in m:
                if "threads" in m:
                    max_length = self.longest_field(field, m["threads"],
                                                    max_length)
                continue
            if m[field] is None:
                continue
            if field == "from":
                m[field] = archive.util.format_from(m)
            elif field == "author_name":
                m[field] = archive.util.format_author(m)
            elif field == "to":
                m[field] = archive.util.format_to(m)
            elif field == "date":
                m[field] = str(archive.util.format_date(m, self.archive_name))
            if m[field] is None:
                continue
            length = len(m[field])
            if length > max_length:
                max_length = length
                print(">> " + m[field])
            if "follow-up" in m:
                max_length = self.longest_field(field, m["follow-up"],
                                                max_length)
        return max_length