archiving

This commit is contained in:
gauthiier 2019-12-22 14:40:23 +01:00
parent e3641ec1ad
commit cb93e046af
5 changed files with 52 additions and 34 deletions

View File

@ -1,5 +1,5 @@
import email, email.parser
import os, json, gzip, re
import os, json, gzip, re, logging
import mysql.connector as mariadb
import archive.sql, archive.util
from datetime import date, datetime
@ -51,9 +51,9 @@ def connect_db(database, host, user, password):
try:
con = mariadb.connect(host=host, user=user, password=password, database=database)
except mariadb.Error as error:
print("Error: {}".format(error))
logging.error("Error: {}".format(error))
if error.errno == 1049:
print("Database " + database + " does not exist.")
logging.error("Database " + database + " does not exist.")
return None
finally:
return con
@ -73,15 +73,13 @@ def list_tables_db(database, host, user, password):
return results
except mariadb.Error as error:
print("Error: {}".format(error))
logging.error("Error: {}".format(error))
finally:
cursor.close()
con.close()
def list_tables_db_config(config):
return list_tables_db(config['database'], config['host'], config['user'], config['password'])
class Archive:
@ -93,10 +91,10 @@ class Archive:
# this is twisted................ two constructors... dumb
if isinstance(config, str):
# need a filename or a dir name....
print("reading archive " + archive_name, end='')
logging.info("reading archive " + archive_name)
archive_dir = config
(self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
print(" - done.")
logging.info(" - done.")
elif isinstance(config, dict):
self.archive_name = archive_name
self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
@ -116,24 +114,24 @@ class Archive:
def create_db(self, config=None):
print("creating table: " + self.archive_name, end='')
logging.info("creating table: " + self.archive_name)
if self.db_con is None:
if config is not None:
self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
if self.db_con is None:
print(" - no connection... Aborting.")
logging.warning(" - no connection... Aborting.")
return
try:
cursor = self.db_con.cursor()
cursor.execute(archive.sql.CREATE.format(self.archive_name))
except mariadb.Error as error:
print("Error: {}".format(error))
logging.error("Error: {}".format(error))
finally:
cursor.close()
print(" - done.")
logging.info(" - done.")
def insert_db(self, config=None):
@ -153,7 +151,7 @@ class Archive:
for t in self.data:
n_inserted = self.recursive_insert_db(cursor, t["threads"])
# print(" - insert: " + str(n_inserted), end='')
logging.info(" - " + str(n_inserted))
if n_inserted > 0:
self.db_con.commit()
@ -164,7 +162,7 @@ class Archive:
self.db_con.commit()
except mariadb.Error as error:
print("Error: {}".format(error))
logging.error("Error: {}".format(error))
pass
finally:
cursor.close()
@ -175,13 +173,15 @@ class Archive:
for m in thread:
try:
logging.info(" - in - " + m['date'] + " " + m['subject'])
from_ = archive.util.format_from(m)
author_name_ = archive.util.format_author(m)
to_ = archive.util.format_to(m)
date_ = archive.util.format_date(m, self.archive_name)
if date_ is None or from_ is None:
print("\nDATE ERROR: " + m['from'] + " - " + m['date'])
logging.warning("\nDATE ERROR: " + m['from'] + " - " + m['date'])
continue
@ -197,10 +197,11 @@ class Archive:
if error.errno == 1062:
#duplication continue <------------------------- look this up...
# print("\nError: {}".format(error))
logging.info("+++db+++ duplicate")
continue
else:
print("\nError: {}".format(error))
print(str_insert)
logging.error("\nError: {}".format(error))
logging.error(str_insert)
continue
return n_inserted
@ -208,7 +209,7 @@ class Archive:
def content_search(self, term, bool=True):
if self.db_con is None:
print("Not connection to database...")
logging.warning("Not connection to database...")
return
try:
@ -226,14 +227,14 @@ class Archive:
return results
except mariadb.Error as error:
print("Error: {}".format(error))
logging.error("Error: {}".format(error))
finally:
cursor.close()
def from_search(self, term, bool=True):
if self.db_con is None:
print("Not connection to database...")
logging.warning("Not connection to database...")
return
try:
@ -251,7 +252,7 @@ class Archive:
return results
except mariadb.Error as error:
print("Error: {}".format(error))
logging.error("Error: {}".format(error))
finally:
cursor.close()

View File

@ -12,6 +12,9 @@ CREATE = "CREATE TABLE `{}` (" \
"FULLTEXT (`from_`, `author_name_`)" \
") ENGINE = InnoDB;"
# FULLTEXT manual
# ALTER TABLE tableName ADD FULLTEXT(columnA, columnB);
INSERT = ("INSERT INTO {}"
"(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

View File

@ -215,7 +215,7 @@ def format_id(msg, archive_name):
# format='%d/%m/%Y'
def min_date(archive_name):
if "nettime" in archive_name:
if archive_name == "nettime_l":
return '01/10/1995'
elif archive_name == "spectre":
return '01/08/2001'
@ -223,3 +223,8 @@ def min_date(archive_name):
return '01/01/2002'
elif archive_name == "crumb":
return '01/02/2001'
elif archive_name == "oldboys":
return '01/03/2001'
elif archive_name == "nettime_bold":
return '01/01/2000'

View File

@ -2,6 +2,7 @@ import os, logging, argparse
from glob import glob
import archive.archive as archive
import config
import terminal.util as util
logging.basicConfig(level=logging.DEBUG)
@ -10,25 +11,32 @@ def list_archives(archives_dir):
def run(lists, archives):
logging.debug("indexing: " + str(lists) + " from " + archives)
lists_db = archive.list_tables_db_config(config.db)
for a in lists:
ar = archive.Archive(a, archives)
if a not in lists_db:
if util.y_n_question("Archive " + a + " db table does not exist. Create it?"):
ar.create_db(config.db)
else:
logging.info("Table not created. Aborting.")
return
ar.insert_db(config.db)
if __name__ == "__main__":
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
p.add_argument('list', metavar="list", help="list(s) to index", nargs="+")
p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
p.add_argument('list', metavar="list", help="list(s) to index", nargs="+")
p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
args = p.parse_args()
args = p.parse_args()
if not args.archives:
args.archives = config.archives
if not args.archives:
args.archives = config.archives
if len(args.list) == 1 and args.list[0] == "all":
args.list = list_archives(args.archives)
if len(args.list) == 1 and args.list[0] == "all":
args.list = list_archives(args.archives)
run(args.list, args.archives)
run(args.list, args.archives)

View File

@ -1,3 +1,4 @@
import sys
def y_n_question(question_str):