#!/usr/bin/env python
from __future__ import print_function

import argparse
import json
import os
import re
import sys
import time
import urllib
import urllib2
from contextlib import closing
from datetime import datetime
from urllib import urlencode
from urllib2 import HTTPError, URLError, urlopen
from urlparse import urljoin
from xml.etree import ElementTree as ET

import html5lib

# Connection settings used when neither the padinfo JSON file nor the command
# line supplies a value.
PADINFO_DEFAULTS = dict(
    hostname="",
    apiversion="1.2.9",
    apiurl="/api/",
)

# Directory containing this module, and the templates directory beside it.
MODULE_PATH = os.path.dirname(__file__)
TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates")

# Module-wide switch for diagnostic output on stderr (set from --verbose).
verbose = False
2015-02-26 13:54:26 +01:00
def pad_split_group(n):
    """Split a pad id into a ``(group_id, pad_name)`` pair.

    Ids of the form ``g.<group>$<name>`` yield the group part (without the
    ``g.`` prefix) and the name. Any other id is returned unchanged as the
    name, paired with an empty group string.
    """
    matched = re.match(r"g\.(\w+)\$(.+)$", n)
    return matched.groups() if matched else ('', n)
2015-02-26 17:15:41 +01:00
def content(tag):
    """Return the inner markup of *tag* as a unicode string.

    The result is the element's leading text followed by the serialization
    of each child element (which includes each child's tail text).

    :param tag: an ElementTree element, or None.
    :returns: unicode text; empty when *tag* is None or has no content.
    """
    if tag is None:
        return u''
    # ET.tostring() returns ASCII-encoded bytes by default (non-ASCII
    # characters become numeric character references), so decoding as ASCII
    # is lossless and guarantees a text result regardless of input.
    children = u''.join(ET.tostring(child).decode("ascii") for child in tag)
    return (tag.text or u'') + children
class PadServer(object):
    """Small client for the HTTP API of an etherpad server.

    The API base URL is ``<protocol>://<hostname>[:<port>]<apipath><apiversion>/``.
    Every request sends ``apikey`` as a query parameter.
    """

    def __init__(self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
        """
        :param hostname: server host name.
        :param port: server port; a falsy value leaves the port out of the URL.
        :param apipath: URL path prefix of the API.
        :param apiversion: API version appended to *apipath*.
        :param apikey: key sent with every request.
        :param secure: use https instead of http.
        """
        self.hostname = hostname
        self.protocol = "https" if secure else "http"
        self.apiurl = self.protocol + "://" + hostname
        if port:
            self.apiurl += ":{0}".format(port)
        self.apiurl += "{0}{1}/".format(apipath, apiversion)
        self.apikey = apikey

    @staticmethod
    def _utf8(value):
        """Return *value* as UTF-8 bytes; byte strings pass through untouched."""
        if isinstance(value, bytes):
            return value
        return value.encode("utf-8")

    @staticmethod
    def _to_datetime(millis):
        """Convert a millisecond timestamp to a local datetime with whole seconds."""
        # Floor division keeps the result free of microseconds on every
        # Python version, so isoformat() always yields YYYY-MM-DDTHH:MM:SS.
        return datetime.fromtimestamp(int(millis) // 1000)

    def _call(self, method, result_key, **params):
        """Call API *method* and return ``response['data'][result_key]``.

        The response is always closed. When the server answers without a
        ``data`` object the lookup raises TypeError, as it did before.
        """
        params['apikey'] = self.apikey
        url = self.apiurl + method + '?' + urlencode(params)
        with closing(urlopen(url)) as response:
            return json.load(response)['data'][result_key]

    def listAllPads(self):
        """Return the list of all pad ids known to the server."""
        return self._call('listAllPads', 'padIDs')

    def listAllGroups(self):
        """Return the list of all group ids known to the server."""
        return self._call('listAllGroups', 'groupIDs')

    def getPadText(self, padID):
        """Return the plain-text contents of pad *padID*."""
        return self._call('getText', 'text', padID=self._utf8(padID))

    def getPadHTML(self, padID):
        """Return the HTML contents of pad *padID*."""
        return self._call('getHTML', 'html', padID=self._utf8(padID))

    def getPadLastEdited(self, padID):
        """Return the last-edited time of pad *padID* as a local datetime."""
        raw = self._call('getLastEdited', 'lastEdited', padID=self._utf8(padID))
        return self._to_datetime(raw)

    def getPadURL(self, padID):
        """Return the browser URL for pad *padID*.

        Group pads map to ``/p/<padID>``, all others to ``/public_pad/<padID>``.
        """
        # NOTE(review): unlike apiurl, this URL carries no port and padID is
        # not URL-quoted -- presumably the public site sits behind a proxy;
        # confirm before changing.
        group, _ = pad_split_group(padID)
        path = "/p/" if group else "/public_pad/"
        return self.protocol + "://" + self.hostname + path + padID
2015-03-05 12:11:21 +01:00
def get_template_env(tpath=None):
    """Build the jinja2 environment used to render pages.

    Templates are searched first in *tpath* (only when it names an existing
    directory) and then in TEMPLATES_PATH.

    Example::

        template = get_template_env().get_template('pad.html')
        print(template.render(the='variables', go='here').encode("utf-8"))
    """
    import jinja2

    search_path = [TEMPLATES_PATH]
    if tpath and os.path.isdir(tpath):
        search_path.insert(0, tpath)
    return jinja2.Environment(loader=jinja2.FileSystemLoader(search_path))
2015-03-05 16:04:59 +01:00
def _ensure_dir(path):
    """Create *path* including missing parents; an existing directory is fine."""
    try:
        os.makedirs(path)
    except OSError:
        # Only ignore the failure when the directory really is there;
        # permission problems and the like must surface.
        if not os.path.isdir(path):
            raise


def dumpPads(padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None):
    """Write text, HTML and JSON metadata files for each pad in *padids*.

    Group pads go to ``<outputpath>/<group_path>/<group_id>/<pad_name>.*``,
    all other pads to ``<outputpath>/<pub_path>/<pad_name>.*``. A pad whose
    stored metadata already carries the server's current last-edited time is
    skipped unless *force* is set.

    :param padserver: object providing getPadLastEdited, getPadText,
        getPadHTML and getPadURL.
    :param padids: sequence of pad ids to dump.
    :param outputpath: root directory; paths stored in the metadata are
        relative to it.
    :param pub_path: sub-directory for non-group pads.
    :param group_path: sub-directory for group pads.
    :param sleeptime: pause in seconds after each dumped pad (falsy: none).
    :param force: dump even when the stored metadata is up to date.
    :param templates: extra template directory passed to get_template_env.
    """
    template_env = get_template_env(templates)
    pad_template = template_env.get_template("pad.html")
    numpads = len(padids)
    for i, padid in enumerate(padids):
        group_id, pad_name = pad_split_group(padid)
        # Create the directories where the files are actually written
        # (below outputpath), not relative to the current directory.
        if group_id:
            target_dir = os.path.join(outputpath, group_path, group_id)
        else:
            target_dir = os.path.join(outputpath, pub_path)
        _ensure_dir(target_dir)
        fp = os.path.join(target_dir, pad_name)

        if verbose:
            print(u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)
        else:
            sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i + 1, numpads))
            sys.stderr.flush()

        textpath = fp + ".txt"
        htmlpath = fp + ".html"
        metapath = fp + ".json"
        last_edited = padserver.getPadLastEdited(padid).isoformat()
        if os.path.exists(metapath):
            try:
                with open(metapath) as f:
                    meta = json.load(f)
            except ValueError:
                # Unreadable metadata: treat the pad as stale and re-dump it.
                meta = {}
            if not force and meta.get("last_edited") == last_edited:
                if verbose:
                    print("Up to date, skipping", file=sys.stderr)
                continue

        meta = {
            'pad_id': padid,
            'group_id': group_id,
            'pad_name': pad_name,
            'last_edited': last_edited,
        }
        complete = True

        # Write text. Fetch first so a failed request neither creates an
        # empty file nor truncates the previous dump.
        try:
            text = padserver.getPadText(padid)
        except (TypeError, HTTPError, ValueError) as e:
            print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
            complete = False
        else:
            with open(textpath, "wb") as f:
                f.write(text.encode("utf-8"))
            meta['text_path'] = os.path.relpath(textpath, outputpath)
            meta['text_length'] = len(text)
            meta['text_length_human'] = humanize_bytes(meta['text_length'])

        # Write HTML, with the same failure policy as the text above.
        try:
            html = padserver.getPadHTML(padid)
        except (TypeError, HTTPError, ValueError) as e:
            print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
            complete = False
        else:
            meta['html_path'] = os.path.relpath(htmlpath, outputpath)
            meta['html_length'] = len(html)
            if pad_template:
                t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
                body = t.find(".//body")
                editurl = padserver.getPadURL(padid)
                meta['url'] = editurl
                # NOTE(review): an earlier comment reported a unicode error
                # around this render/encode step; cause not visible here.
                page = pad_template.render(
                    body=content(body),
                    title=padid,
                    editurl=editurl,
                    sourceurl=textpath,
                    metadata_json=json.dumps(meta))
            else:
                page = html
            # Render before opening so an error cannot leave a truncated file.
            with open(htmlpath, "wb") as f:
                f.write(page.encode("utf-8"))

        if not complete:
            # Do not record the timestamp, so the next run retries this pad
            # instead of skipping it as up to date.
            meta.pop('last_edited', None)
        with open(metapath, "w") as f:
            json.dump(meta, f)

        if sleeptime:
            time.sleep(sleeptime)

    if not verbose:
        done = "\rDumping pads... [{0}]".format(numpads)
        # Pad with spaces to erase the longer "[n/n]" progress text.
        width = len("\rDumping pads... [{0}/{0}]".format(numpads))
        sys.stderr.write(done.ljust(width) + "\n")
        sys.stderr.flush()
2015-02-26 17:15:41 +01:00
def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    The value is scaled by the largest power-of-1024 unit that does not
    exceed it and formatted with *precision* decimal places.

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(1024)
    '1 kb'
    >>> humanize_bytes(1024*123, 1)
    '123.0 kb'
    >>> humanize_bytes(1024*12342, 1)
    '12.1 Mb'
    >>> humanize_bytes(1024*12342, 2)
    '12.05 Mb'
    >>> humanize_bytes(1024*1234, 2)
    '1.21 Mb'
    >>> humanize_bytes(1024*1234*1111, 2)
    '1.31 Gb'
    >>> humanize_bytes(1024*1234*1111, 1)
    '1.3 Gb'
    """
    abbrevs = (
        (1 << 50, 'Petabyte'),
        (1 << 40, 'Tb'),
        (1 << 30, 'Gb'),
        (1 << 20, 'Mb'),
        (1 << 10, 'kb'),
        (1, 'bytes'),
    )
    if bytes == 1:
        return '1 byte'
    # Largest unit not exceeding the value; values below 1 use plain bytes.
    factor, suffix = next(
        (unit for unit in abbrevs if bytes >= unit[0]),
        abbrevs[-1])
    # Force true division: without `from __future__ import division` the
    # quotient would be truncated and `precision` would have no effect.
    return '%.*f %s' % (precision, float(bytes) / factor, suffix)
def padids_from_path(path):
    """Load every ``*.json`` file found directly inside *path*.

    Files are read in sorted path order. Each decoded object gets a
    ``'path'`` entry holding the file it was read from.

    :returns: list of decoded objects; empty when nothing matches.
    """
    from glob import glob

    def load(json_path):
        with open(json_path) as handle:
            record = json.load(handle)
        record['path'] = json_path
        return record

    return [load(json_path) for json_path in sorted(glob(os.path.join(path, "*.json")))]
2015-02-26 13:54:26 +01:00
def build_parser():
    """Create the command-line parser shared by all commands."""
    parser = argparse.ArgumentParser()
    # command
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, index')
    # padinfo
    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')
    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates')
    # listpads/groups-specific
    parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON')
    # dump-specific
    parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date')
    parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)')
    parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
    # index-specific
    parser.add_argument('--title', default="etherpad index & archive", help='(index) title')
    parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups')
    parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file')
    parser.add_argument('--output', default=None, help='(index) path for output (default stdout)')
    return parser


def _load_padinfo(args):
    """Merge defaults, the padinfo JSON file and explicit options, in that order."""
    # Copy, so the module-level defaults are never mutated.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                padinfo.update(json.load(f))
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # stderr, so the warning cannot corrupt JSON written to stdout
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)
    # allow explicit opts to override
    for key in ('hostname', 'port', 'apikey', 'apiversion', 'apiurl'):
        value = getattr(args, key)
        if value:
            padinfo[key] = value
    return padinfo


def _augment_info(info, groupinfo):
    """Add parsed last-edited fields and, when known, the group name to *info*."""
    raw = info.get("last_edited")
    if raw is not None:
        dt = datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S")
        info['last_edited_parsed'] = dt
        info['last_edited_str'] = str(dt)
    if groupinfo:
        gid = info.get("group_id") or ""
        if gid.startswith("g."):
            gid = gid[2:]
        if gid in groupinfo:
            info[u"group_name"] = groupinfo[gid].get("name")
    return info


def _collect_pads(pubpath, grouppath, exclude_groups, groupinfo=None):
    """Load pad metadata from *pubpath* and from each group directory in *grouppath*.

    When *groupinfo* is given, group directories not named in it are skipped.
    """
    pads = [_augment_info(x, groupinfo) for x in padids_from_path(pubpath)]
    if exclude_groups or not os.path.exists(grouppath):
        return pads
    for name in sorted(os.listdir(grouppath)):
        gp = os.path.join(grouppath, name)
        if not os.path.isdir(gp):
            continue
        if groupinfo and name not in groupinfo:
            continue
        try:
            pad_infos = [_augment_info(x, groupinfo) for x in padids_from_path(gp)]
        except (IOError, OSError) as e:
            # Best effort: report the unreadable group and carry on.
            print("WARNING: unable to read {0} ({1})".format(gp, e), file=sys.stderr)
            continue
        pads.extend(pad_infos)
    return pads


def main(argv=None):
    """Run the command-line tool; return the process exit status."""
    global verbose
    args = build_parser().parse_args(argv)
    verbose = args.verbose
    padinfo = _load_padinfo(args)

    padserver = PadServer(
        hostname=padinfo.get("hostname"),
        port=padinfo.get("port"),
        apipath=padinfo.get("apiurl"),
        apiversion=padinfo.get("apiversion"),
        apikey=padinfo.get("apikey")
    )
    if verbose:
        print("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################
    cmd = args.command.lower()
    if cmd == "listpads":
        padids = padserver.listAllPads()
        if not args.lines:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = padserver.listAllGroups()
        if not args.lines:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = padserver.listAllPads()
        if args.skip:
            padids = padids[args.skip:]
        if args.limit:
            padids = padids[:args.limit]
        dumpPads(
            padserver,
            padids,
            args.outputpath,
            args.pubpath,
            args.grouppath,
            force=args.force,
            templates=args.templates)
        if verbose:
            print("Completed in {0:0.0f} seconds".format(time.time() - start), file=sys.stderr)

    elif cmd == "index":
        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)
            if verbose:
                print("Using groupinfo", file=sys.stderr)
        # NOTE(review): pubpath/grouppath are read relative to the current
        # directory here, while dump writes below --outputpath; confirm
        # whether index should honour --outputpath too.
        pads = _collect_pads(args.pubpath, args.grouppath, args.exclude_groups, groupinfo)
        # Sort on the name only: comparing the metadata dicts themselves on
        # equal names is unreliable, and a missing name must not crash.
        pads.sort(key=lambda x: (x.get("pad_name") or u"").lower())
        env = get_template_env(args.templates)
        index_template = env.get_template("index.html")
        rendered = index_template.render(
            pads=pads,
            title=args.title,
            timestamp=datetime.now()
        ).encode("utf-8")
        if args.output:
            with open(args.output, "wb") as out:
                out.write(rendered)
        else:
            sys.stdout.write(rendered)

    else:
        print("Command '{0}' not understood, try: listpads, listgroups, dump, index".format(args.command), file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())