diff --git a/random/delete_from_archive.py b/random/delete_from_archive.py new file mode 100644 index 0000000..67eb8e6 --- /dev/null +++ b/random/delete_from_archive.py @@ -0,0 +1,115 @@ +import os, json, logging, re +# import config + +logging.basicConfig(level=logging.DEBUG) + +def list_archives(archives_dir): + return [d for d in os.listdir(archives_dir) if os.path.isdir(os.path.join(archives_dir, d))] + +def list_archive_files(archive_dir): + return [os.path.join(archive_dir, f) for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.endswith('.json')] + +def delete(archive_path, black_list_names): + # the archive_path should be a single js file + if not os.path.isfile(archive_path): + logging.error(archive_path + " is not a file. Aborting") + return + + logging.info("Deleting " + str(black_list_names)) + with open(archive_path, 'r') as fp: + d = json.load(fp) + + logging.info(d['name'] + " " + archive_path) + logging.info(" - nbr. threads before deletion: " + str(len(d['threads']))) + for m in d['threads']: + if m['author_name'] in black_list_names: + d['threads'].remove(m) + logging.info(" - nbr. threads after deletion: " + str(len(d['threads']))) + logging.info("writing to " + archive_path) + + with open(archive_path, 'w') as fp: + json.dump(d, fp, indent=4) + + logging.info("done.") + +def delete_random_spam(archive_path): + + spam_signals = ["Unable to process data: multipart/mixed", "Bulk Email Web Hosting!", "WORK FROM YOUR OWN HOME", "paul_ugo1@yahoo.com", "Advertise Your Business"] + + if not os.path.isfile(archive_path): + logging.error(archive_path + " is not a file. Aborting") + return + + logging.info("Deleting spamz") + with open(archive_path, 'r') as fp: + d = json.load(fp) + + logging.info(d['name'] + " " + archive_path) + logging.info(" ++ nbr. threads before deletion: " + str(len(d['threads']))) + for m in d['threads']: + for s in spam_signals: + if s.lower() in m['content'].lower(): + logging.info(" - " + m['date'] + " - " + m['subject']) + d['threads'].remove(m) + break + if m['content'].lower().startswith("title: "): + logging.info(" - " + m['date'] + " - " + m['subject']) + d['threads'].remove(m) + break + if m['content'].lower().startswith("attachment: "): + logging.info(" - " + m['date'] + " - " + m['subject']) + d['threads'].remove(m) + break + + if m['author_name'] and re.match(r'[A-Za-z0-9 _.,!"\|\\\/\-\+\'\[\]\{\}]', m['author_name']) is None: + logging.info(" - " + m['author_name']) + d['threads'].remove(m) + break + + logging.info(" ++ nbr. threads after deletion: " + str(len(d['threads']))) + logging.info("writing to " + archive_path) + + with open(archive_path, 'w') as fp: + json.dump(d, fp, indent=4) + + logging.info("done.") + + + +def delete_from_archive(archive, archive_dir, black_list_names=None): + + archives = list_archives(archive_dir) + + if archive not in archives: + logging.error(archive + " does not exist. Aborting") + return + + archive_dir = os.path.join(archive_dir, archive) + if not os.path.isdir(archive_dir): + logging.error(archive_dir + " does not exist. Aborting") + return + + files = list_archive_files(archive_dir) + if not files: + logging.error("No files in " + archive_dir + ". Aborting") + return + + for f in files: + if black_list_names is not None: + delete(f, black_list_names) + else: + delete_random_spam(f) + + + +if __name__ == "__main__": + + ## no refactor yet... cleaning nettime-bold manually... + + # black = ["MRE EKI OMORODION", "EKI OMORODION", "orellanaweb", "KENNETH OSHODI"] + + # white = ["Lachlan Brown", "brad brace", "Alan Sondheim", "David Goldschmidt", "Brian Holmes", "][co][De][e][p.rivation", "Francis Hwang", "Florian Cramer", "- G a r r e t t -"] + + delete_from_archive('nettime_bold', "../MAILINGLISTSARCHIVES") + +