# Mathematics-and-Artifice/fetch_bib.py

import pathlib, re, frontmatter, markdown, citeproc, json
from pyzotero import zotero
from html.parser import HTMLParser
import utils

# ANSI escape sequences ('\33' is the ESC character) used to colorize the
# messages printed to the terminal by this script.
# CEND resets the terminal attributes after a colored span.
CEND = '\33[0m'
# Foreground colors: red, green, magenta/violet and blue.
CRED = '\33[31m'
CGREEN = '\33[32m'
CVIOLET = '\33[35m'
CBLUE   = '\33[34m'

# Non-greedy match of anything enclosed in angle brackets (an HTML tag).
re_html = '<.*?>'
# Any single character that is neither a word character nor whitespace.
re_punc = r'[^\w\s]'
# Combined pattern: matches HTML tags or punctuation. It is used below to
# strip both from a rendered title before turning it into a filename.
CLEANR = re.compile(f'{re_html}|{re_punc}') 

def format_reading(title:str, desc:str):
	"""Build the front-matter post for one reading.

	The post body is ``desc``; its metadata carries ``title`` and a fixed
	``type`` of ``'reading'``.
	"""
	metadata = {'title': title, 'type': 'reading'}
	return frontmatter.Post(desc, **metadata)

def compare_readings(path:pathlib.PosixPath, title:str, desc:str):
	"""Load the reading stored at ``path`` and compare it with the expected values.

	Returns a ``(post, unchanged)`` pair: the loaded post, and ``True`` only
	when its metadata and whitespace-stripped content are exactly ``title``,
	type ``'reading'`` and the stripped ``desc`` (any extra metadata key makes
	the comparison fail).
	"""
	stored = frontmatter.load(path)

	actual = stored.to_dict()
	actual['content'] = actual['content'].strip()

	expected = {
		'title': title,
		'type': 'reading',
		'content': desc.strip(),
	}
	unchanged = actual == expected
	return stored, unchanged

def update_reading(previous:frontmatter.Post, new:frontmatter.Post):
	"""Interactively merge ``new`` into ``previous`` and return ``previous``.

	For every metadata key of ``new`` whose value differs from (or is absent
	in) ``previous``, and then for the content, the user is shown both
	versions and asked on stdin whether to take the new one. Answering ``y``
	(surrounding whitespace and case are ignored) applies the change; any
	other answer keeps the stored value.

	Metadata keys that exist only in ``previous`` are left untouched: there is
	no new value to offer, and indexing ``new`` with such a key would raise
	``KeyError``.

	``previous`` is modified in place.
	"""
	previous_keys = set(previous.keys())

	# Only keys present in `new` can be updated; iterating them directly
	# (instead of the union of both key sets) avoids KeyError on keys that
	# exist on one side only and gives a stable prompt order.
	for k in new.keys():
		in_previous = k in previous_keys
		if in_previous and str(previous[k]).strip() == str(new[k]).strip():
			continue

		# A key missing from the stored file is displayed as None.
		prev_value = previous[k] if in_previous else None
		print(f"Update '{k}' (y/n)?\n\tprev: {CRED}{prev_value}{CEND}\n\tnew: {CGREEN}{new[k]}{CEND}")
		c = input()
		if c.strip().lower() == 'y':
			previous[k] = new[k]

	if not previous.content.strip() == new.content.strip():
		print(f"Update 'content' (y/n)?\n\tprev: {CRED}{previous.content}{CEND}\n\tnew: {CGREEN}{new.content}{CEND}")
		c = input()
		if c.strip().lower() == 'y':
			previous.content = new.content

	return previous

def valid_bib_entry(csljson):
	"""Return True when a CSL-JSON item should appear in the bibliography.

	An item is rejected when it has no usable title (missing, not a string,
	or blank), or when its title contains, case-insensitively, one of the
	marker phrases listed below.

	csljson: mapping holding the item's CSL-JSON fields; only ``'title'`` is read.
	"""
	title = csljson.get('title')
	# Without a title there is nothing to build a filename from, so skip the
	# item instead of raising KeyError.
	if not isinstance(title, str) or not title.strip():
		return False

	rejected_markers = (
		"full text",
		"table of contents pdf",
		"submitted version",
		"includes bibliographical references",
		# add more...
	)
	lowered = title.lower()
	return not any(marker in lowered for marker in rejected_markers)

def format_filename_title(data_csl:dict, bib_entry:str):
	"""Derive a markdown filename and a display title from a rendered entry.

	The item's title (``data_csl['title']``) is located case-insensitively in
	``bib_entry``; the display title is ``bib_entry`` cut shortly after the end
	of that match, and the filename is that display title with HTML tags and
	punctuation removed, spaces replaced by ``-`` and ``.md`` appended.

	Returns ``(filename, title)``, or ``None`` when the title cannot be found
	in ``bib_entry`` -- callers must check for ``None`` before unpacking.
	"""
	import html  # local import: only needed for the escaped-title fallback

	raw_title = data_csl['title']
	haystack = bib_entry.lower()

	# bib_entry is rendered as HTML, so a title containing &, < or > (or
	# quotes) presumably appears in escaped form -- NOTE(review): confirm
	# against the citeproc HTML formatter. Try the raw title first, then the
	# escaped variants.
	candidates = (
		raw_title,
		html.escape(raw_title, quote=False),
		html.escape(raw_title),
	)
	for needle in candidates:
		start = haystack.find(needle.lower())
		if start != -1:
			break
	else:
		return None

	# Extra characters kept after the title: 4 for books, 1 otherwise.
	# Presumably this covers the closing `</i>` of an italicized book title
	# and the trailing punctuation mark respectively -- TODO confirm.
	trailing = 4 if data_csl['type'] == 'book' else 1
	end = start + len(needle) + trailing

	title = bib_entry[:end]
	filename = CLEANR.sub('', title).replace(' ', '-') + ".md"

	return filename, title


if __name__ == "__main__":
	# Script entry point: for every Zotero collection, render its items as a
	# Chicago author-date bibliography and create or interactively update one
	# markdown "reading" file per entry under <content>/bibliography/<collection>/.

	conf = utils.load_conf()

	z = zotero.Zotero(conf['zotero_group_id'], conf['zotero_lib_type'], conf['zotero_api_key'])

	# The CSL style file is the same for every collection, so resolve it once.
	style_file = pathlib.Path(conf['template']) / 'csl' / 'chicago-author-date.csl'

	for d in z.collections():
		collection_name = d['data']['name']
		collection_key = d['key']
		print("---")

		# Ask for the items as CSL-JSON, the input format citeproc consumes.
		# NOTE(review): this fetches whatever a single collection_items() call
		# returns; check the pyzotero docs on paging if collections are large.
		z.add_parameters(content='csljson')
		collection = z.collection_items(collection_key)
		collection.reverse()

		# compile bib

		src = citeproc.source.json.CiteProcJSON(json_data=collection)
		style = citeproc.CitationStylesStyle(style_file.absolute(), validate=False)
		bib = citeproc.CitationStylesBibliography(style=style, source=src, formatter=citeproc.formatter.html)

		entries = [citeproc.CitationItem(e['id']) for e in collection if valid_bib_entry(e)]
		bib.register(citeproc.Citation(entries))

		# Pair each registered key with its rendered entry by zipping the
		# bibliography's keys with the rendered entries.
		rendered = [str(r) for r in bib.style.render_bibliography(entries)]
		kv = dict(zip(bib.keys, rendered))

		# process collection and bib

		for e in collection:

			# kv is keyed by bib.keys, which are matched here in lowercase.
			eid = e['id'].lower()

			# Items rejected by valid_bib_entry were never registered, so they
			# have no rendered entry. Looking up kv directly (rather than
			# testing bib.keys and then indexing kv) also avoids a KeyError if
			# fewer entries were rendered than keys registered.
			bib_entry = kv.get(eid)
			if bib_entry is None:
				continue

			# format_filename_title returns None when the item's title is not
			# found in the rendered entry; skip instead of failing to unpack.
			result = format_filename_title(e, bib_entry)
			if result is None:
				print(f"{CRED}skipping {eid}: title not found in rendered entry{CEND}")
				continue
			filename, title = result

			filepath = pathlib.Path(conf['content']) / "bibliography" / collection_name / filename

			if not filepath.exists():
				print(f"new reading: {title}")
				new = format_reading(title=title, desc=bib_entry)
				utils.save_file(filepath, frontmatter.dumps(new), mkdirs=True)
				continue

			prev, eq = compare_readings(filepath, title, bib_entry)

			if eq:
				print(f"reading {CVIOLET}{title}{CEND} already exists... continuing")
				continue
			print(f"updating reading: {CBLUE}{title}{CEND}")

			## selective update
			new = format_reading(title=title, desc=bib_entry)

			updated = update_reading(prev, new)
			utils.save_file(filepath, frontmatter.dumps(updated), overwrite=True)

			# Report the title, like the other messages, rather than dumping
			# the whole CSL-JSON dict.
			print(f"reading {title} updated")
haha 2025-09-21 06:54:25 +02:00			`import pathlib, re, frontmatter, markdown, citeproc, json`
			`from pyzotero import zotero`
			`from html.parser import HTMLParser`
			`import utils`

			`CEND = '\33[0m'`
			`CRED = '\33[31m'`
			`CGREEN = '\33[32m'`
			`CVIOLET = '\33[35m'`
			`CBLUE = '\33[34m'`

			`re_html = '<.*?>'`
			`re_punc = r'[^\w\s]'`
			`CLEANR = re.compile(f'{re_html}\|{re_punc}')`

			`def format_reading(title:str, desc:str):`
			`c = frontmatter.Post(content=desc)`
			`c['title'] = title`
			`c['type'] = 'reading'`
			`return c`

			`def compare_readings(path:pathlib.PosixPath, title:str, desc:str):`
			`p = frontmatter.load(path)`
			`pd = p.to_dict()`
			`pd['content'] = pd['content'].strip()`
			`return p, (pd == {'title': title, 'type': 'reading', 'content': desc.strip()})`

			`def update_reading(previous:frontmatter.Post, new:frontmatter.Post):`
			`keys = set(previous.keys()).union(set(new.keys()))`
			`for k in keys:`
			`if not str(previous[k]).strip() == str(new[k]).strip():`
			`print(f"Update '{k}' (y/n)?\n\tprev: {CRED}{previous[k]}{CEND}\n\tnew: {CGREEN}{new[k]}{CEND}")`
			`c = input()`
			`if c == 'y':`
			`previous[k] = new[k]`

			`if not previous.content.strip() == new.content.strip():`
			`print(f"Update 'content' (y/n)?\n\tprev: {CRED}{previous.content}{CEND}\n\tnew: {CGREEN}{new.content}{CEND}")`
			`c = input()`
			`if c == 'y':`
			`previous.content = new.content`

			`return previous`

			`def valid_bib_entry(csljson):`
			`v = True`
			`v = v and not ("full text" in csljson['title'].lower())`
			`v = v and not ("Table of Contents PDF".lower() in csljson['title'].lower())`
			`v = v and not ("Submitted Version".lower() in csljson['title'].lower())`
			`v = v and not ("Includes Bibliographical References".lower() in csljson['title'].lower())`

			`# add more...`
			`return v`

			`def format_filename_title(data_csl:dict, bib_entry:str):`
			`index = bib_entry.lower().find(data_csl['title'].lower())`
			`if index == -1:`
			`return None`
			`index += len(data_csl['title']) + 4 if data_csl['type'] == 'book' else len(data_csl['title']) + 1`

			`title = bib_entry[:index]`
			`filename = re.sub(CLEANR, '', title).replace(' ', '-') + ".md"`

			`return filename, title`


			`if __name__ == "__main__":`

			`conf = utils.load_conf()`

			`z = zotero.Zotero(conf['zotero_group_id'], conf['zotero_lib_type'], conf['zotero_api_key'])`

			`for d in z.collections():`
			`collection_name = d['data']['name']`
			`collection_key = d['key']`
			`print("---")`

			`z.add_parameters(content='csljson')`
			`collection = z.collection_items(collection_key)`
			`collection.reverse()`

			`# compile bib`

			`style_file = pathlib.Path(conf['template']) / 'csl' / 'chicago-author-date.csl'`

			`src = citeproc.source.json.CiteProcJSON(json_data=collection)`
			`style = citeproc.CitationStylesStyle(style_file.absolute(), validate=False)`
			`bib = citeproc.CitationStylesBibliography(style=style, source=src, formatter=citeproc.formatter.html)`

			`# print(collection)`
			`entries = [citeproc.CitationItem(e['id']) for e in collection if valid_bib_entry(e)]`
			`bib.register(citeproc.Citation(entries))`

			`# because citeproc-py can't design shit...`
			`kv = dict(zip(bib.keys, [str(e) for e in bib.style.render_bibliography(entries)]))`

			`# print(kv)`

			`# process collection and bib`

			`for e in collection:`

			`eid = e['id'].lower()`

			`if eid not in bib.keys:`
			`continue`

			`bib_entry = kv[eid]`
			`filename, title = format_filename_title(e, bib_entry)`
			`filepath = pathlib.Path(conf['content']) / "bibliography" / collection_name / filename`

			`if not filepath.exists():`
			`print(f"new reading: {title}")`
			`new = format_reading(title=title, desc=bib_entry)`
			`utils.save_file(filepath, frontmatter.dumps(new), mkdirs=True)`

			`else:`
			`prev, eq = compare_readings(filepath, title, bib_entry)`

			`if eq:`
			`print(f"reading {CVIOLET}{title}{CEND} already exists... continuing")`
			`continue`
			`print(f"updating reading: {CBLUE}{title}{CEND}")`

			`## selective update`
			`new = format_reading(title=title, desc=bib_entry)`

			`updated = update_reading(prev, new)`
			`utils.save_file(filepath, frontmatter.dumps(updated), overwrite=True)`

			`print(f"reading {e} updated")`