random fix script
This commit is contained in:
parent
a8cfaee935
commit
e3641ec1ad
115
random/delete_from_archive.py
Normal file
115
random/delete_from_archive.py
Normal file
@ -0,0 +1,115 @@
|
||||
import os, json, logging, re
|
||||
# import config
|
||||
|
||||
# Configure the root logger at DEBUG level so the info-level progress messages
# emitted by the functions below are shown when the script runs.
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
def list_archives(archives_dir):
    """Return the names of the immediate sub-directories of ``archives_dir``.

    Args:
        archives_dir: Directory whose entries are inspected.

    Returns:
        A list of entry names (not full paths) that are directories, in the
        order ``os.listdir`` reports them.
    """
    def _is_directory(entry_name):
        # Entries come back as bare names, so resolve them against the parent.
        return os.path.isdir(os.path.join(archives_dir, entry_name))

    return list(filter(_is_directory, os.listdir(archives_dir)))
|
||||
|
||||
def list_archive_files(archive_dir):
    """Return the paths of the ``.json`` files located directly in ``archive_dir``.

    Args:
        archive_dir: Directory to scan (not recursive).

    Returns:
        A list of paths, each ``archive_dir`` joined with the file name, for
        every regular file whose name ends with ``.json``.
    """
    json_paths = []
    for entry_name in os.listdir(archive_dir):
        candidate = os.path.join(archive_dir, entry_name)
        # Directories that happen to be named "*.json" are not archive files.
        if entry_name.endswith('.json') and os.path.isfile(candidate):
            json_paths.append(candidate)
    return json_paths
|
||||
|
||||
def delete(archive_path, black_list_names):
    """Remove every thread written by a black-listed author from one archive file.

    The file at ``archive_path`` is parsed as JSON, every entry of its
    ``'threads'`` list whose ``'author_name'`` is in ``black_list_names`` is
    dropped, and the result is written back to the same path.

    Args:
        archive_path: Path to a single JSON archive file containing the keys
            ``'name'`` and ``'threads'``.
        black_list_names: Container of author names whose threads are removed.

    Returns:
        None. If ``archive_path`` is not a file, an error is logged and the
        function returns without touching anything.
    """
    # the archive_path should be a single JSON file
    if not os.path.isfile(archive_path):
        logging.error(archive_path + " is not a file. Aborting")
        return

    logging.info("Deleting " + str(black_list_names))
    with open(archive_path, 'r') as fp:
        d = json.load(fp)

    logging.info(d['name'] + " " + archive_path)
    logging.info(" - nbr. threads before deletion: " + str(len(d['threads'])))

    # Build a filtered list instead of calling remove() inside the loop:
    # removing from a list while iterating over it skips the element that
    # follows each removal, so consecutive black-listed threads would survive.
    d['threads'] = [m for m in d['threads'] if m['author_name'] not in black_list_names]

    logging.info(" - nbr. threads after deletion: " + str(len(d['threads'])))
    logging.info("writing to " + archive_path)

    with open(archive_path, 'w') as fp:
        json.dump(d, fp, indent=4)

    logging.info("done.")
|
||||
|
||||
def delete_random_spam(archive_path):
    """Remove threads that look like spam from one archive file.

    The file at ``archive_path`` is parsed as JSON and rewritten in place
    without the entries of its ``'threads'`` list that match any of these
    heuristics:

    * the content contains one of the hard-coded spam phrases
      (case-insensitive);
    * the content starts with ``"title: "`` or ``"attachment: "``
      (case-insensitive);
    * the author name is non-empty and its first character is outside the
      allowed character class (``re.match`` only anchors at the start, so
      only the first character is tested).

    Args:
        archive_path: Path to a single JSON archive file containing the keys
            ``'name'`` and ``'threads'``; each thread is expected to provide
            ``'content'``, ``'author_name'``, ``'date'`` and ``'subject'``.

    Returns:
        None. If ``archive_path`` is not a file, an error is logged and the
        function returns without touching anything.
    """
    spam_signals = ["Unable to process data: multipart/mixed", "Bulk Email Web Hosting!", "WORK FROM YOUR OWN HOME", "paul_ugo1@yahoo.com", "Advertise Your Business"]

    if not os.path.isfile(archive_path):
        logging.error(archive_path + " is not a file. Aborting")
        return

    logging.info("Deleting spamz")
    with open(archive_path, 'r') as fp:
        d = json.load(fp)

    logging.info(d['name'] + " " + archive_path)
    logging.info(" ++ nbr. threads before deletion: " + str(len(d['threads'])))

    # Loop invariants: lower-case the phrases and compile the pattern once.
    lowered_signals = [s.lower() for s in spam_signals]
    allowed_first_char = re.compile(r'[A-Za-z0-9 _.,!"\|\\\/\-\+\'\[\]\{\}]')

    # Collect the survivors in a new list instead of calling remove() while
    # iterating: removing during iteration skips the element that follows
    # each removal, which left adjacent spam messages in the archive.
    kept = []
    for m in d['threads']:
        content = m['content'].lower()
        author = m['author_name']
        if content.startswith(("title: ", "attachment: ")) or any(s in content for s in lowered_signals):
            logging.info(" - " + m['date'] + " - " + m['subject'])
        elif author and allowed_first_char.match(author) is None:
            logging.info(" - " + author)
        else:
            kept.append(m)
    d['threads'] = kept

    logging.info(" ++ nbr. threads after deletion: " + str(len(d['threads'])))
    logging.info("writing to " + archive_path)

    with open(archive_path, 'w') as fp:
        json.dump(d, fp, indent=4)

    logging.info("done.")
|
||||
|
||||
|
||||
|
||||
def delete_from_archive(archive, archive_dir, black_list_names=None):
    """Clean every JSON file of one named archive.

    Args:
        archive: Name of the archive, i.e. a sub-directory of ``archive_dir``.
        archive_dir: Directory that contains all archives.
        black_list_names: Optional container of author names. When given,
            each file is passed to ``delete`` with these names; when ``None``,
            each file is passed to ``delete_random_spam`` instead.

    Returns:
        None. An error is logged and nothing is processed when the archive
        is unknown, is not a directory, or holds no JSON files.
    """
    if archive not in list_archives(archive_dir):
        logging.error(archive + " does not exist. Aborting")
        return

    # From here on, work inside the archive's own directory.
    archive_dir = os.path.join(archive_dir, archive)
    if not os.path.isdir(archive_dir):
        logging.error(archive_dir + " does not exist. Aborting")
        return

    json_paths = list_archive_files(archive_dir)
    if len(json_paths) == 0:
        logging.error("No files in " + archive_dir + ". Aborting")
        return

    # The cleaning mode is the same for every file, so decide it once.
    purge_by_author = black_list_names is not None
    for json_path in json_paths:
        if purge_by_author:
            delete(json_path, black_list_names)
        else:
            delete_random_spam(json_path)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":

    ## no refactor yet... cleaning nettime-bold manually...

    # Author lists kept for reference (presumably from earlier manual runs):
    # black = ["MRE EKI OMORODION", "EKI OMORODION", "orellanaweb", "KENNETH OSHODI"]
    # white = ["Lachlan Brown", "brad brace", "Alan Sondheim", "David Goldschmidt", "Brian Holmes", "][co][De][e][p.rivation", "Francis Hwang", "Florian Cramer", "- G a r r e t t -"]

    # No black list is passed, so each file goes through delete_random_spam.
    delete_from_archive(archive='nettime_bold', archive_dir="../MAILINGLISTSARCHIVES")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user