archive work

commit 7892f1fa64 (parent 8a92b3b1be)
@@ -133,8 +133,7 @@ class Archive:
 
         logging.info(" - done.")
 
-
-    def insert_db(self, config=None):
+    def insert_db(self, config=None, file=None):
 
         if self.db_con is None:
             if config is not None:
@@ -143,6 +142,10 @@ class Archive:
         if self.db_con is None:
             return
 
+        tag = file
+        if file is not None:
+            tag = file.replace("_", " ")
+
         try:
             cursor = self.db_con.cursor()
 
@@ -150,6 +153,14 @@ class Archive:
 
         for t in self.data:
 
+            # print(tag)
+            print(t['name'])
+
+            if file is not None and t['name'] != tag:
+                continue
+
+            logging.info("inserting " + t['name'])
+
             n_inserted = self.recursive_insert_db(cursor, t["threads"])
             logging.info(" - " + str(n_inserted))
             if n_inserted > 0:
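The new file parameter threads a single-archive filter through insert_db:
when a file name is given, only the table whose name matches that file name
with underscores mapped to spaces gets (re)inserted. A minimal sketch of the
matching rule -- matches_file and the sample values are illustrative, not
from the repository:

    def matches_file(table_name, file=None):
        # No filter requested: every table qualifies.
        if file is None:
            return True
        # The commit derives the tag by mapping underscores to spaces,
        # so a file "some_list" selects the table named "some list".
        return table_name == file.replace("_", " ")

    assert matches_file("some list", "some_list")
    assert matches_file("anything")  # no file given, nothing is skipped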
index.py (+4 -3)

@@ -9,7 +9,7 @@ logging.basicConfig(level=logging.DEBUG)
 def list_archives(archives_dir):
     return [d for d in os.listdir(archives_dir) if os.path.isdir(os.path.join(archives_dir, d))]
 
-def run(lists, archives):
+def run(lists, archives, file=None):
     logging.debug("indexing: " + str(lists) + " from " + archives)
     lists_db = archive.list_tables_db_config(config.db)
 
@@ -21,12 +21,13 @@ def run(lists, archives):
         else:
             logging.info("Table not created. Aborting.")
             return
-        ar.insert_db(config.db)
+        ar.insert_db(config.db, file=file)
 
 if __name__ == "__main__":
 
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('list', metavar="list", help="list(s) to index", nargs="+")
+    p.add_argument('--file', metavar="file", help="archive file to index")
     p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
 
     args = p.parse_args()
@@ -37,6 +38,6 @@ if __name__ == "__main__":
     if len(args.list) == 1 and args.list[0] == "all":
         args.list = list_archives(args.archives)
 
-    run(args.list, args.archives)
+    run(args.list, args.archives, args.file)
 
 
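With the new flag, an index run can be limited to a single archive file
instead of re-inserting every table. A hypothetical invocation -- the list
name some-list and the file name 2019_March are made up for illustration:

    # Equivalent of running:  python index.py some-list --file 2019_March
    import index
    index.run(["some-list"], "archives", file="2019_March")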
@@ -16,11 +16,11 @@ def collect_from_url(url, name, base_archive_dir):
     soup = lists.util.request(url)
 
     threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
-    lists = []
+    li = []
     for t in threads_list:
         thread_label = t.text.strip()
         thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
-        lists.append((thread_label, thread_url))
+        li.append((thread_label, thread_url))
 
     # create (main) directory
     # this is where all temp files will be created
@@ -29,9 +29,9 @@ def collect_from_url(url, name, base_archive_dir):
         os.makedirs(d)
 
     threads = []
-    nbr_threads = str(len(lists))
+    nbr_threads = str(len(li))
     n = 0
-    for l in lists: ### change this
+    for l in li: ### change this
         n += 1
         logging.info("## " + str(n) + " / " + nbr_threads + " ##")
         try:
@@ -85,18 +85,18 @@ def collect_threads_from_url(url, name, base_arch_dir):
     soup = lists.util.request(url)
 
     table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
-    lists = []
+    li = []
     for tr in table:
         if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
-            lists.append(tr)
+            li.append(tr)
 
     # the thread structure here is flat -- re: non-hierarchical, unlike pipermail
     # hence the thread parsing algorithm will also be flat -- re: a single loop
 
-    nbr_msgs = str(len(lists))
+    nbr_msgs = str(len(li))
     n = 0
     last_message = None
-    for tr in lists:
+    for tr in li:
         n += 1
         logging.info(" > " + str(n) + "/" + nbr_msgs)
         td = tr.find_all('td')
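The lists -> li rename above (and in the next file) is more than cosmetic:
these functions call the imported lists package (lists.util.request(url))
and then assigned to a local variable also named lists. In Python, an
assignment anywhere in a function makes that name local for the whole
function body, so the earlier package lookup raises UnboundLocalError at
runtime. A self-contained reproduction, with os standing in for the lists
package:

    import os

    def broken():
        cwd = os.getcwd()  # UnboundLocalError: `os` is local here...
        os = []            # ...because this later assignment makes it local
        return cwd

    try:
        broken()
    except UnboundLocalError as e:
        print(e)  # e.g. "local variable 'os' referenced before assignment"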
@@ -16,7 +16,7 @@ def collect_from_url(url, name, base_archive_dir):
     # soup = BeautifulSoup(html, "html5lib")
 
     threads_list = soup.find_all('tr')
-    lists = []
+    li = []
     for t in threads_list[1:]:
         cols = t.find_all('td')
         if len(cols) < 2:
@@ -25,7 +25,7 @@ def collect_from_url(url, name, base_archive_dir):
         thread_url = cols[1].select('a:nth-of-type(1)')[0].get('href') # this is relative
         url = (url + "/") if not url.endswith('/') else url
         thread_url = urllib.parse.urljoin(url, thread_url)
-        lists.append((thread_label, thread_url)) # list of tuples
+        li.append((thread_label, thread_url)) # list of tuples
 
     # create (main) directory
     # this is where all temp files will be created
@@ -34,9 +34,9 @@ def collect_from_url(url, name, base_archive_dir):
         os.makedirs(d)
 
     threads = []
-    nbr_threads = str(len(lists))
+    nbr_threads = str(len(li))
     n = 0
-    for l in lists: ### change this
+    for l in li: ### change this
         n += 1
         logging.info("## " + str(n) + " / " + nbr_threads + " ##")
         try:
@@ -83,14 +83,14 @@ def collect_threads_from_url(url, name, base_arch_dir):
     # soup = BeautifulSoup(html, "html5lib")
 
     ul = soup.find_all('ul')[1];
-    lists = ul.find_all('li', recursive=False)
+    li = ul.find_all('li', recursive=False)
 
     is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
 
-    #lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
-    nbr_msgs = str(len(lists))
+    #li = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
+    nbr_msgs = str(len(li))
     n = 0
-    for li in lists:
+    for li in li:
         n += 1
         logging.info(" > " + str(n) + "/" + nbr_msgs)
         try:
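One wrinkle survives the rename in the last hunk: `for li in li:` makes the
loop variable shadow the list it iterates. The loop itself still works,
because Python builds the iterator before the first rebinding, but afterwards
li refers to the final element rather than the list. A small demonstration:

    li = ["a", "b", "c"]
    for li in li:  # the iterator is created from the list before li is rebound
        pass
    print(li)      # prints "c" -- the original list is no longer reachable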