archiving

This commit is contained in:
gauthiier 2019-12-22 14:40:23 +01:00
parent e3641ec1ad
commit cb93e046af
5 changed files with 52 additions and 34 deletions

View File

@ -1,5 +1,5 @@
import email, email.parser import email, email.parser
import os, json, gzip, re import os, json, gzip, re, logging
import mysql.connector as mariadb import mysql.connector as mariadb
import archive.sql, archive.util import archive.sql, archive.util
from datetime import date, datetime from datetime import date, datetime
@ -51,9 +51,9 @@ def connect_db(database, host, user, password):
try: try:
con = mariadb.connect(host=host, user=user, password=password, database=database) con = mariadb.connect(host=host, user=user, password=password, database=database)
except mariadb.Error as error: except mariadb.Error as error:
print("Error: {}".format(error)) logging.error("Error: {}".format(error))
if error.errno == 1049: if error.errno == 1049:
print("Database " + database + " does not exist.") logging.error("Database " + database + " does not exist.")
return None return None
finally: finally:
return con return con
@ -73,15 +73,13 @@ def list_tables_db(database, host, user, password):
return results return results
except mariadb.Error as error: except mariadb.Error as error:
print("Error: {}".format(error)) logging.error("Error: {}".format(error))
finally: finally:
cursor.close() cursor.close()
con.close() con.close()
def list_tables_db_config(config):
return list_tables_db(config['database'], config['host'], config['user'], config['password'])
class Archive: class Archive:
@ -93,10 +91,10 @@ class Archive:
# this is twisted................ two constructors... dumb # this is twisted................ two constructors... dumb
if isinstance(config, str): if isinstance(config, str):
# need a filename or a dir name.... # need a filename or a dir name....
print("reading archive " + archive_name, end='') logging.info("reading archive " + archive_name)
archive_dir = config archive_dir = config
(self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir) (self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
print(" - done.") logging.info(" - done.")
elif isinstance(config, dict): elif isinstance(config, dict):
self.archive_name = archive_name self.archive_name = archive_name
self.db_con = connect_db(config['database'], config['host'], config['user'], config['password']) self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
@ -116,24 +114,24 @@ class Archive:
def create_db(self, config=None): def create_db(self, config=None):
print("creating table: " + self.archive_name, end='') logging.info("creating table: " + self.archive_name)
if self.db_con is None: if self.db_con is None:
if config is not None: if config is not None:
self.db_con = connect_db(config['database'], config['host'], config['user'], config['password']) self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
if self.db_con is None: if self.db_con is None:
print(" - no connection... Aborting.") logging.warning(" - no connection... Aborting.")
return return
try: try:
cursor = self.db_con.cursor() cursor = self.db_con.cursor()
cursor.execute(archive.sql.CREATE.format(self.archive_name)) cursor.execute(archive.sql.CREATE.format(self.archive_name))
except mariadb.Error as error: except mariadb.Error as error:
print("Error: {}".format(error)) logging.error("Error: {}".format(error))
finally: finally:
cursor.close() cursor.close()
print(" - done.") logging.info(" - done.")
def insert_db(self, config=None): def insert_db(self, config=None):
@ -153,7 +151,7 @@ class Archive:
for t in self.data: for t in self.data:
n_inserted = self.recursive_insert_db(cursor, t["threads"]) n_inserted = self.recursive_insert_db(cursor, t["threads"])
# print(" - insert: " + str(n_inserted), end='') logging.info(" - " + str(n_inserted))
if n_inserted > 0: if n_inserted > 0:
self.db_con.commit() self.db_con.commit()
@ -164,7 +162,7 @@ class Archive:
self.db_con.commit() self.db_con.commit()
except mariadb.Error as error: except mariadb.Error as error:
print("Error: {}".format(error)) logging.error("Error: {}".format(error))
pass pass
finally: finally:
cursor.close() cursor.close()
@ -175,13 +173,15 @@ class Archive:
for m in thread: for m in thread:
try: try:
logging.info(" - in - " + m['date'] + " " + m['subject'])
from_ = archive.util.format_from(m) from_ = archive.util.format_from(m)
author_name_ = archive.util.format_author(m) author_name_ = archive.util.format_author(m)
to_ = archive.util.format_to(m) to_ = archive.util.format_to(m)
date_ = archive.util.format_date(m, self.archive_name) date_ = archive.util.format_date(m, self.archive_name)
if date_ is None or from_ is None: if date_ is None or from_ is None:
print("\nDATE ERROR: " + m['from'] + " - " + m['date']) logging.warning("\nDATE ERROR: " + m['from'] + " - " + m['date'])
continue continue
@ -197,10 +197,11 @@ class Archive:
if error.errno == 1062: if error.errno == 1062:
#duplication continue <------------------------- look this up... #duplication continue <------------------------- look this up...
# print("\nError: {}".format(error)) # print("\nError: {}".format(error))
logging.info("+++db+++ duplicate")
continue continue
else: else:
print("\nError: {}".format(error)) logging.error("\nError: {}".format(error))
print(str_insert) logging.error(str_insert)
continue continue
return n_inserted return n_inserted
@ -208,7 +209,7 @@ class Archive:
def content_search(self, term, bool=True): def content_search(self, term, bool=True):
if self.db_con is None: if self.db_con is None:
print("Not connection to database...") logging.warning("Not connection to database...")
return return
try: try:
@ -226,14 +227,14 @@ class Archive:
return results return results
except mariadb.Error as error: except mariadb.Error as error:
print("Error: {}".format(error)) logging.error("Error: {}".format(error))
finally: finally:
cursor.close() cursor.close()
def from_search(self, term, bool=True): def from_search(self, term, bool=True):
if self.db_con is None: if self.db_con is None:
print("Not connection to database...") logging.warning("Not connection to database...")
return return
try: try:
@ -251,7 +252,7 @@ class Archive:
return results return results
except mariadb.Error as error: except mariadb.Error as error:
print("Error: {}".format(error)) logging.error("Error: {}".format(error))
finally: finally:
cursor.close() cursor.close()

View File

@ -12,6 +12,9 @@ CREATE = "CREATE TABLE `{}` (" \
"FULLTEXT (`from_`, `author_name_`)" \ "FULLTEXT (`from_`, `author_name_`)" \
") ENGINE = InnoDB;" ") ENGINE = InnoDB;"
# FULLTEXT manual
# ALTER TABLE tableName ADD FULLTEXT(columnA, columnB);
INSERT = ("INSERT INTO {}" INSERT = ("INSERT INTO {}"
"(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) " "(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s)") "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

View File

@ -215,7 +215,7 @@ def format_id(msg, archive_name):
# format='%d/%m/%Y' # format='%d/%m/%Y'
def min_date(archive_name): def min_date(archive_name):
if "nettime" in archive_name: if archive_name == "nettime_l":
return '01/10/1995' return '01/10/1995'
elif archive_name == "spectre": elif archive_name == "spectre":
return '01/08/2001' return '01/08/2001'
@ -223,3 +223,8 @@ def min_date(archive_name):
return '01/01/2002' return '01/01/2002'
elif archive_name == "crumb": elif archive_name == "crumb":
return '01/02/2001' return '01/02/2001'
elif archive_name == "oldboys":
return '01/03/2001'
elif archive_name == "nettime_bold":
return '01/01/2000'

View File

@ -2,6 +2,7 @@ import os, logging, argparse
from glob import glob from glob import glob
import archive.archive as archive import archive.archive as archive
import config import config
import terminal.util as util
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@ -10,9 +11,16 @@ def list_archives(archives_dir):
def run(lists, archives): def run(lists, archives):
logging.debug("indexing: " + str(lists) + " from " + archives) logging.debug("indexing: " + str(lists) + " from " + archives)
lists_db = archive.list_tables_db_config(config.db)
for a in lists: for a in lists:
ar = archive.Archive(a, archives) ar = archive.Archive(a, archives)
if a not in lists_db:
if util.y_n_question("Archive " + a + " db table does not exist. Create it?"):
ar.create_db(config.db)
else:
logging.info("Table not created. Aborting.")
return
ar.insert_db(config.db) ar.insert_db(config.db)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,3 +1,4 @@
import sys
def y_n_question(question_str): def y_n_question(question_str):