"""Maintain a tagged selection of archived threads and its expanded dump.

The selection file (``sel``) maps each tag to a list of
``{'list': ..., 'url': ...}`` entries.  The dump file (``sel_dump``) maps each
tag to the full thread objects looked up in the JSON archives under ``ARCH``.
"""

import glob
import json
import logging
import os

ARCH = "archives/"
EXP = "selection/"

sel = os.path.join(EXP, "tm-selection.js")
sel_dump = os.path.join(EXP, "tm-selection-dump.js")


def _read_json(path):
    """Return the parsed content of the JSON file at *path*.

    Files are read as UTF-8 because this module writes them as UTF-8 with
    ``ensure_ascii=False``; relying on the platform default could misdecode.
    """
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)


def _write_json(path, obj):
    """Serialize *obj* to *path* as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(obj, fp, ensure_ascii=False, indent=4)


# In-memory copy of the selection file; mutated by commit_selection().
try:
    d = _read_json(sel)
except FileNotFoundError:
    # First run: no selection yet.  A corrupt file still raises, so existing
    # data is never silently replaced by an empty selection.
    logging.warning("Selection file not found, starting empty: %s", sel)
    d = {}


def lists():
    """Return the names of the entries found directly under ``ARCH``."""
    # The original read ``sel.ARCH``; ``sel`` is a str, so that always raised
    # AttributeError.
    return os.listdir(ARCH)


def tags():
    """Return the tags currently present in the in-memory selection."""
    return list(d)


def find(li, url):
    """Look up the thread with the given *url* in the archive list *li*.

    Only the top-level entries of each file's ``'threads'`` array are
    compared; follow-up messages are not searched.

    Returns the matching thread dict, or ``None`` when the archive directory
    does not exist or no thread matches.
    """
    archive_dir = os.path.join(ARCH, li)
    if not os.path.isdir(archive_dir):
        logging.warning("Invalid archive path: " + archive_dir)
        print("Invalid archive path: " + archive_dir)
        return None

    for path in glob.glob(os.path.join(archive_dir, "*.json")):
        for thread in _read_json(path)['threads']:
            if thread['url'] == url:
                return thread
    return None


def recursive_urls(msg):
    """Return the url of *msg* followed by the urls of all its follow-ups.

    The order is depth-first: each follow-up is immediately followed by its
    own follow-ups.
    """
    urls = [msg['url']]
    for follow_up in msg.get('follow-up', ()):
        urls += recursive_urls(follow_up)
    return urls


def commit_selection(li, url, tag):
    """Record ``(li, url)`` under *tag* and persist the selection file.

    Returns ``False`` without writing when *url* is already recorded under
    *tag*, ``True`` otherwise.  An unknown tag is created and announced on
    stdout.
    """
    if tag not in d:
        print("new tag: " + tag)
        d[tag] = []

    if any(entry['url'] == url for entry in d[tag]):
        return False

    d[tag].append({'list': li, 'url': url})
    _write_json(sel, d)
    return True


def commit_dump(li, url, tag):
    """Commit ``(li, url)`` to the selection and append the thread to the dump.

    Returns the list of urls covered by the committed thread (the thread and
    its follow-ups), or ``None`` when the url was already selected under
    *tag* or the thread cannot be found in the archives.
    """
    if not commit_selection(li, url, tag):
        return None

    thread = find(li, url)  # scans the archive files; this is the slow step
    if thread is None:
        return None

    try:
        dump = _read_json(sel_dump)
    except FileNotFoundError:
        # No dump written yet: start a fresh one instead of crashing on the
        # very first commit.
        dump = {}

    dump.setdefault(tag, []).append(thread)
    _write_json(sel_dump, dump)
    return recursive_urls(thread)


def commit_from_selection():
    """Rebuild the whole dump file from the selection file on disk.

    Every selected entry is looked up in the archives; found threads are
    annotated with their ``'list'`` name.  Entries that cannot be found are
    skipped.  The module-level ``d`` is not refreshed by this call.
    """
    selection = _read_json(sel)
    dump = {}
    for tag, entries in selection.items():
        dump[tag] = []
        for entry in entries:
            thread = find(entry['list'], entry['url'])  # slow: scans archives
            if thread is not None:
                thread['list'] = entry['list']
                dump[tag].append(thread)
    _write_json(sel_dump, dump)


def report():
    """Return a text summary of the in-memory selection.

    One line per tag: the total number of entries followed by a per-list
    breakdown.
    """
    parts = ["Report: \n"]
    for tag, entries in d.items():
        per_list = {}
        for entry in entries:
            per_list[entry['list']] = per_list.get(entry['list'], 0) + 1

        parts.append("<" + tag + ">: " + str(len(entries)) + " (")
        for name, count in per_list.items():
            parts.append(name + ": " + str(count) + " / ")
        parts.append(")\n")
    return "".join(parts)


def recursive_format(msg):
    """Strip *msg* and its follow-ups in place down to summary fields.

    ``'id'``, ``'content'`` and ``'content-type'`` are required and removed
    (``KeyError`` if absent); ``'content'`` is replaced by its length under
    ``'len'``.  ``'to'`` and ``'message-id'`` are removed when present.
    """
    msg.pop('id')
    msg['len'] = len(msg.pop('content'))
    msg.pop('content-type')
    msg.pop('to', None)
    msg.pop('message-id', None)
    for follow_up in msg.get('follow-up', ()):
        recursive_format(follow_up)


def format_selection():
    """Load the dump file and return it with every message summarized."""
    dump = _read_json(sel_dump)
    for threads in dump.values():
        for thread in threads:
            recursive_format(thread)
    return dump


def recursive_hashmap(msg, tag, hm):
    """Map the url of *msg* and of all its follow-ups to *tag* in *hm*."""
    hm[msg['url']] = tag
    for follow_up in msg.get('follow-up', ()):
        recursive_hashmap(follow_up, tag, hm)


def hashmap():
    """Return a ``{url: tag}`` mapping for every message in the dump file.

    A url that appears under several tags keeps the last tag encountered.
    """
    dump = _read_json(sel_dump)
    url_to_tag = {}
    for tag, threads in dump.items():
        for thread in threads:
            recursive_hashmap(thread, tag, url_to_tag)
    return url_to_tag


if __name__ == "__main__":
    print(json.dumps(format_selection(), indent=4, sort_keys=True))