index(es)

This commit is contained in:
parent cabfe50777
commit afc71795d1

index.py · 75 lines · Normal file
@@ -0,0 +1,75 @@
import argparse, os, sys, json, logging

from keywords import rake
from lxml import etree as et

import yake

logging.basicConfig(level=logging.DEBUG)


def index(f):
    """Build a keyword -> [mail number] index for one chapter XML file."""
    if not os.path.isfile(f):
        logging.error(f + " is not a valid file.")
        return None

    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]

    indx = {}

    r = rake.Rake('keywords/FoxStoplist.txt', max_words_length=3, min_keyword_frequency=1)
    y = yake.KeywordExtractor(lan="en", top=40)

    root = et.parse(f).getroot()

    for m in root.findall('mails/mail'):
        nbr_str = m.find('nbr').text
        content = m.find('content').text

        # format nbr: prefix the mail number with the chapter number
        nbr_str = ch + '.' + nbr_str

        # yake
        try:
            kwy = y.extract_keywords(content)
            for k in kwy:
                kw = k[0]
                if kw not in indx:
                    indx[kw] = []
                indx[kw].append(nbr_str)
        except Exception as e:
            print(e)

        # rake: keep only candidates scoring above 4.0
        try:
            kwr = r.run(content)
            kwr = [x for x in kwr if x[1] > 4.0]
            for k in kwr:
                kw = k[0]
                if kw not in indx:
                    indx[kw] = []
                indx[kw].append(nbr_str)
        except Exception as e:
            print(e)

    return indx


if __name__ == '__main__':

    p = argparse.ArgumentParser(description='Builds an index of emails')
    p.add_argument('file', metavar="f", help="xml file to index")

    args = p.parse_args()

    ind = index(args.file)
    ind_out = {'selected': {}, 'orphan': ind}

    print(json.dumps(ind_out, indent=4, sort_keys=True, ensure_ascii=False))
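For reference, index() only touches three element names: it iterates root.findall('mails/mail') and reads the nbr and content children of each mail. A minimal sketch of a matching input file (the root tag and sample values are hypothetical, only the element names come from the code above):

import lxml.etree as et

root = et.Element('chapter')
mails = et.SubElement(root, 'mails')
mail = et.SubElement(mails, 'mail')
et.SubElement(mail, 'nbr').text = '0001'
et.SubElement(mail, 'content').text = 'Subject: welcome to the list ...'
et.ElementTree(root).write('3.Network.xml', pretty_print=True)

Running python index.py 3.Network.xml then prints the {'selected': {}, 'orphan': {...}} JSON to stdout, which index.sh below redirects into the index/*.js files.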
index.sh · 34 lines · Executable file
@@ -0,0 +1,34 @@
#!/bin/sh

echo "1.Welcome.xml -> 1.Welcome-index.js"
python index.py xml/1.Welcome.xml > index/1.Welcome-index.js
echo "2.DeepEurope.xml -> 2.DeepEurope-index.js"
python index.py xml/2.DeepEurope.xml > index/2.DeepEurope-index.js
echo "3.Network.xml -> 3.Network-index.js"
python index.py xml/3.Network.xml > index/3.Network-index.js
echo "4.net.art.xml -> 4.net.art-index.js"
python index.py xml/4.net.art.xml > index/4.net.art-index.js
echo "5.Netzkritik.xml -> 5.Netzkritik-index.js"
python index.py xml/5.Netzkritik.xml > index/5.Netzkritik-index.js
echo "6.FLOSS.xml -> 6.FLOSS-index.js"
python index.py xml/6.FLOSS.xml > index/6.FLOSS-index.js
echo "7.Critique_Art_Politics.xml -> 7.Critique_Art_Politics-index.js"
python index.py xml/7.Critique_Art_Politics.xml > index/7.Critique_Art_Politics-index.js
echo "8.TacticalMedia.xml -> 8.TacticalMedia-index.js"
python index.py xml/8.TacticalMedia.xml > index/8.TacticalMedia-index.js
echo "9.List_talking_to_List.xml -> 9.List_talking_to_List-index.js"
python index.py xml/9.List_talking_to_List.xml > index/9.List_talking_to_List-index.js
echo "10.Cyberfeminism.xml -> 10.Cyberfeminism-index.js"
python index.py xml/10.Cyberfeminism.xml > index/10.Cyberfeminism-index.js
echo "11.CODE.xml -> 11.CODE-index.js"
python index.py xml/11.CODE.xml > index/11.CODE-index.js
echo "13.Post-digital.xml -> 13.Post-digital-index.js"
python index.py xml/13.Post-digital.xml > index/13.Post-digital-index.js
echo "14.MANIFESTO.xml -> 14.MANIFESTO-index.js"
python index.py xml/14.MANIFESTO.xml > index/14.MANIFESTO-index.js
echo "15.LutherBlissett.xml -> 15.LutherBlissett-index.js"
python index.py xml/15.LutherBlissett.xml > index/15.LutherBlissett-index.js
echo "16.NN.xml -> 16.NN-index.js"
python index.py xml/16.NN.xml > index/16.NN-index.js
echo "17.Interviews.xml -> 17.Interviews-index.js"
python index.py xml/17.Interviews.xml > index/17.Interviews-index.js
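The same pipeline can be driven from Python instead of repeating one line per chapter; a minimal sketch, assuming index.py is on the import path (rebuild_indexes.py is an editor's illustration, not a file in this commit):

# rebuild_indexes.py -- hypothetical helper, not part of this commit
import glob, json, os
from index import index  # index.py's argparse code runs only under __main__, so importing is safe

for f in sorted(glob.glob('xml/*.xml')):
    out = os.path.join('index', os.path.basename(f)[:-len('.xml')] + '-index.js')
    print(os.path.basename(f), '->', os.path.basename(out))
    ind = index(f)
    with open(out, 'w') as fo:
        json.dump({'selected': {}, 'orphan': ind}, fo, indent=4, sort_keys=True, ensure_ascii=False)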
index/1.Welcome-index.js · 1039 lines · Normal file (diff suppressed because it is too large)
index/10.Cyberfeminism-index.js · 5072 lines · Normal file (diff suppressed because it is too large)
index/11.CODE-index.js · 5902 lines · Normal file (diff suppressed because it is too large)
index/13.Post-digital-index.js · 8037 lines · Normal file (diff suppressed because it is too large)
index/14.MANIFESTO-index.js · 7353 lines · Normal file (diff suppressed because it is too large)
index/15.LutherBlissett-index.js · 6879 lines · Normal file (diff suppressed because it is too large)
index/16.NN-index.js · 21363 lines · Normal file (diff suppressed because it is too large)
index/17.Interviews-index.js · 23676 lines · Normal file (diff suppressed because it is too large)
index/2.DeepEurope-index.js · 2894 lines · Normal file (diff suppressed because it is too large)
index/3.Network-index.js · 16602 lines · Normal file (diff suppressed because it is too large)
index/4.net.art-index.js · 9845 lines · Normal file (diff suppressed because it is too large)
index/5.Netzkritik-index.js · 4579 lines · Normal file (diff suppressed because it is too large)
index/6.FLOSS-index.js · 5835 lines · Normal file (diff suppressed because it is too large)
index/7.Critique_Art_Politics-index.js · 4849 lines · Normal file (diff suppressed because it is too large)
index/8.TacticalMedia-index.js · 11893 lines · Normal file (diff suppressed because it is too large)
index/9.List_talking_to_List-index.js · 14346 lines · Normal file (diff suppressed because it is too large)
keywords/FoxStoplist.txt · 426 lines · Normal file
@@ -0,0 +1,426 @@
#From "A stop list for general text" Fox 1989
a
about
above
across
after
again
against
all
almost
alone
along
already
also
although
always
among
an
and
another
any
anybody
anyone
anything
anywhere
are
area
areas
around
as
ask
asked
asking
asks
at
away
b
back
backed
backing
backs
be
because
became
become
becomes
been
before
began
behind
being
beings
best
better
between
big
both
but
by
c
came
can
cannot
case
cases
certain
certainly
clear
clearly
come
could
d
did
differ
different
differently
do
does
done
down
downed
downing
downs
during
e
each
early
either
end
ended
ending
ends
enough
even
evenly
ever
every
everybody
everyone
everything
everywhere
f
face
faces
fact
facts
far
felt
few
find
finds
first
for
four
from
full
fully
further
furthered
furthering
furthers
g
gave
general
generally
get
gets
give
given
gives
go
going
good
goods
got
great
greater
greatest
group
grouped
grouping
groups
h
had
has
have
having
he
her
herself
here
high
higher
highest
him
himself
his
how
however
i
if
important
in
interest
interested
interesting
interests
into
is
it
its
itself
j
just
k
keep
keeps
kind
knew
know
known
knows
l
large
largely
last
later
latest
least
less
let
lets
like
likely
long
longer
longest
m
made
make
making
man
many
may
me
member
members
men
might
more
most
mostly
mr
mrs
much
must
my
myself
n
necessary
need
needed
needing
needs
never
new
newer
newest
next
no
non
not
nobody
noone
nothing
now
nowhere
number
numbered
numbering
numbers
o
of
off
often
old
older
oldest
on
once
one
only
open
opened
opening
opens
or
order
ordered
ordering
orders
other
others
our
out
over
p
part
parted
parting
parts
per
perhaps
place
places
point
pointed
pointing
points
possible
present
presented
presenting
presents
problem
problems
put
puts
q
quite
r
rather
really
right
room
rooms
s
said
same
saw
say
says
second
seconds
see
seem
seemed
seeming
seems
sees
several
shall
she
should
show
showed
showing
shows
side
sides
since
small
smaller
smallest
so
some
somebody
someone
something
somewhere
state
states
still
such
sure
t
take
taken
than
that
the
their
them
then
there
therefore
these
they
thing
things
think
thinks
this
those
though
thought
thoughts
three
through
thus
to
today
together
too
took
toward
turn
turned
turning
turns
two
u
under
until
up
upon
us
use
uses
used
v
very
w
want
wanted
wanting
wants
was
way
ways
we
well
wells
went
were
what
when
where
whether
which
while
who
whole
whose
why
will
with
within
without
work
worked
working
works
would
x
y
year
years
yet
you
young
younger
youngest
your
yours
z
keywords/oldrake.py · 168 lines · Normal file
@@ -0,0 +1,168 @@
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.

import re
import operator

debug = False
test = True


def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = r'\b' + word + r'(?![\w-])'  # added look ahead for hyphen
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


def generate_candidate_keywords(sentence_list, stopword_pattern):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "":
                phrase_list.append(phrase)
    return phrase_list


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        #if word_list_degree > 3: word_list_degree = 3 #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  #orig.
            #word_degree[word] += 1/(word_list_length*1.0) #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate Word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  #orig.
        #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score):
    keyword_candidates = {}
    for phrase in phrase_list:
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path):
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        sentence_list = split_sentences(text)
        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)
        word_scores = calculate_word_scores(phrase_list)
        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)
        sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)
        return sorted_keywords


# if test:
# text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

# # Split text into sentences
# sentenceList = split_sentences(text)
# #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
# stoppath = "SmartStoplist.txt" #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
# stopwordpattern = build_stop_word_regex(stoppath)

# # generate candidate keywords
# phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

# # calculate individual word scores
# wordscores = calculate_word_scores(phraseList)

# # generate candidate keyword scores
# keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
# if debug: print keywordcandidates

# sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
# if debug: print sortedKeywords

# totalKeywords = len(sortedKeywords)
# if debug: print totalKeywords
# print sortedKeywords[0:(totalKeywords / 3)]

# rake = Rake("SmartStoplist.txt")
# keywords = rake.run(text)
# print keywords
keywords/rake.py · 301 lines · Normal file
@@ -0,0 +1,301 @@
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.
#
# NOTE: The original code (from https://github.com/aneesha/RAKE)
# has been extended by a_medelyan (zelandiya)
# with a set of heuristics to decide whether a phrase is an acceptable candidate,
# as well as the ability to set frequency and phrase length parameters,
# important when dealing with longer documents
#
# NOTE 2: The code published by a_medelyan (https://github.com/zelandiya/RAKE-tutorial)
# has been additionally extended by Marco Pegoraro to implement the adjoined candidate
# feature described in section 1.2.3 of the original paper. Note that this creates the
# need to modify the metric for the candidate score, because the adjoined candidates
# have a very high score (because of the nature of the original score metric)

from __future__ import absolute_import
from __future__ import print_function
import re
import operator
# import six
# from six.moves import range
from collections import Counter

debug = False
test = False


def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_list):
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = '\\b' + word + '\\b'
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


#
# Function that extracts the adjoined candidates from a list of sentences and filters them by frequency
#
def extract_adjoined_candidates(sentence_list, stoplist, min_keywords, max_keywords, min_freq):
    adjoined_candidates = []
    for s in sentence_list:
        # Extracts the candidates from each single sentence and adds them to the list
        adjoined_candidates += adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords)
    # Filters the candidates and returns them
    return filter_adjoined_candidates(adjoined_candidates, min_freq)


# return adjoined_candidates

#
# Function that extracts the adjoined candidates from a single sentence
#
def adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords):
    # Initializes the candidate list to empty
    candidates = []
    # Splits the sentence to get a list of lowercase words
    sl = s.lower().split()
    # For each possible length of the adjoined candidate
    for num_keywords in range(min_keywords, max_keywords + 1):
        # Until the third-last word
        for i in range(0, len(sl) - num_keywords):
            # Position i marks the first word of the candidate. Proceeds only if it's not a stopword
            if sl[i] not in stoplist:
                candidate = sl[i]
                # Initializes j (the pointer to the next word) to 1
                j = 1
                # Initializes the word counter. This counts the non-stopwords words in the candidate
                keyword_counter = 1
                contains_stopword = False
                # Until the word count reaches the maximum number of keywords or the end is reached
                while keyword_counter < num_keywords and i + j < len(sl):
                    # Adds the next word to the candidate
                    candidate = candidate + ' ' + sl[i + j]
                    # If it's not a stopword, increase the word counter. If it is, turn on the flag
                    if sl[i + j] not in stoplist:
                        keyword_counter += 1
                    else:
                        contains_stopword = True
                    # Next position
                    j += 1
                # Adds the candidate to the list only if:
                # 1) it contains at least a stopword (if it doesn't it's already been considered)
                # AND
                # 2) the last word is not a stopword
                # AND
                # 3) the adjoined candidate keyphrase contains exactly the correct number of keywords (to avoid doubles)
                if contains_stopword and candidate.split()[-1] not in stoplist and keyword_counter == num_keywords:
                    candidates.append(candidate)
    return candidates
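# Worked example (editor's illustration, not part of the commit): with
# min_keywords=2 and the Fox stoplist, the sentence "the axis of evil returns"
# yields the adjoined candidate "axis of evil". "of" is a stopword, so the
# plain stopword-splitting pass would break the phrase apart, but here it
# survives as one candidate, provided it recurs at least min_freq times.
# Note the feature only activates when max_words_length_adj is raised above
# its default of 1, since a one-keyword candidate can never contain a stopword.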
#
# Function that filters the adjoined candidates to keep only those that appear with a certain frequency
#
def filter_adjoined_candidates(candidates, min_freq):
    # Creates a dictionary where the key is the candidate and the value is the frequency of the candidate
    candidates_freq = Counter(candidates)
    filtered_candidates = []
    # Uses the dictionary to filter the candidates
    for candidate in candidates:
        freq = candidates_freq[candidate]
        if freq >= min_freq:
            filtered_candidates.append(candidate)
    return filtered_candidates


def generate_candidate_keywords(sentence_list, stopword_pattern, stop_word_list, min_char_length=1, max_words_length=5,
                                min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
                phrase_list.append(phrase)
    phrase_list += extract_adjoined_candidates(sentence_list, stop_word_list, min_words_length_adj,
                                               max_words_length_adj, min_phrase_freq_adj)
    return phrase_list


def is_acceptable(phrase, min_char_length, max_words_length):
    # a phrase must have a min length in characters
    if len(phrase) < min_char_length:
        return 0

    # a phrase must have a max number of words
    words = phrase.split()
    if len(words) > max_words_length:
        return 0

    digits = 0
    alpha = 0
    for i in range(0, len(phrase)):
        if phrase[i].isdigit():
            digits += 1
        elif phrase[i].isalpha():
            alpha += 1

    # a phrase must have at least one alpha character
    if alpha == 0:
        return 0

    # a phrase must have more alpha than digits characters
    if digits > alpha:
        return 0
    return 1


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3 #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0) #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate Word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  # orig.
        # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
    return word_score
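# Worked example (editor's illustration, not part of the commit): for a
# candidate phrase "deep europe" seen once, each word has freq = 1 and
# degree = 2 (co-occurrence degree 1 plus its own frequency), so each word
# scores 2.0 and the phrase totals 4.0; a single word seen three times on
# its own has degree = 3 and freq = 3, scoring 1.0. This is why index.py
# keeps only RAKE results scoring strictly above 4.0: a two-word phrase
# must recur (or co-occur more widely) to pass the cutoff.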
def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
    keyword_candidates = {}
    for phrase in phrase_list:
        if min_keyword_frequency > 1:
            if phrase_list.count(phrase) < min_keyword_frequency:
                continue
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1,
                 min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2):
        self.__stop_words_path = stop_words_path
        self.__stop_words_list = load_stop_words(stop_words_path)
        self.__min_char_length = min_char_length
        self.__max_words_length = max_words_length
        self.__min_keyword_frequency = min_keyword_frequency
        self.__min_words_length_adj = min_words_length_adj
        self.__max_words_length_adj = max_words_length_adj
        self.__min_phrase_freq_adj = min_phrase_freq_adj

    def run(self, text):
        sentence_list = split_sentences(text)

        stop_words_pattern = build_stop_word_regex(self.__stop_words_list)

        phrase_list = generate_candidate_keywords(sentence_list, stop_words_pattern, self.__stop_words_list,
                                                  self.__min_char_length, self.__max_words_length,
                                                  self.__min_words_length_adj, self.__max_words_length_adj,
                                                  self.__min_phrase_freq_adj)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)

        sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)
        return sorted_keywords
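# Usage sketch (editor's illustration; mirrors the call in index.py, the
# sample text is hypothetical):
#
#     r = Rake('keywords/FoxStoplist.txt', max_words_length=3, min_keyword_frequency=1)
#     for phrase, score in r.run("Tactical media emerged from the net.art scene."):
#         print(phrase, score)
#
# run() returns (phrase, score) tuples sorted by descending score.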
# if test and __name__ == '__main__':
# text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

# # Split text into sentences
# sentenceList = split_sentences(text)
# # stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
# stoppath = "data/stoplists/SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
# stopwordpattern = build_stop_word_regex(stoppath)

# # generate candidate keywords
# phraseList = generate_candidate_keywords(sentenceList, stopwordpattern, load_stop_words(stoppath))

# # calculate individual word scores
# wordscores = calculate_word_scores(phraseList)

# # generate candidate keyword scores
# keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
# if debug: print(keywordcandidates)

# sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
# if debug: print(sortedKeywords)

# totalKeywords = len(sortedKeywords)
# if debug: print(totalKeywords)
# print(sortedKeywords[0:(totalKeywords // 3)])

# rake = Rake("data/stoplists/SmartStoplist.txt")
# keywords = rake.run(text)
# print(keywords)
www/routes.py · 119 lines changed
@@ -3,6 +3,7 @@ from www import app
 import json, logging, os, glob
 from lxml import etree as et
 import config
+from collections import OrderedDict

 def list_all(d, ext):
@@ -12,6 +13,112 @@ def list_all(d, ext):
     return [os.path.basename(f) for f in glob.glob(os.path.join(d, "*." + ext))]

+
+@app.route('/')
+def top():
+    return render_template("index.html")
+
+
+'''
+INDEX
+'''
+
+def read_index(d, fn):
+    fp = os.path.join(d, fn)
+    if not os.path.isfile(fp):
+        return None
+
+    with open(fp) as f:
+        index_data = json.load(f, object_pairs_hook=OrderedDict)
+
+    return index_data
+
+
+# def add_selected_kw_index(d, fn, kw):
+#     fp = os.path.join(d, fn)
+#     if not os.path.isfile(fp):
+#         return False
+#
+#     with open(fp) as f:
+#         index_data = json.load(f)
+#
+#     if kw not in index_data['orphan']:
+#         return False
+#
+#     v = index_data['orphan'].pop(kw)
+#     if kw not in index_data['selected']:
+#         index_data['selected'][kw] = []
+#
+#     index_data['selected'][kw] += v
+#
+#     with open(fp, 'w') as fout:
+#         json.dump(index_data, fout, indent=4, sort_keys=True, ensure_ascii=False)
+#
+#     return True
+
+
+def modify_selected_kw_index(d, fn, kw, action="add"):
+    fp = os.path.join(d, fn)
+    if not os.path.isfile(fp):
+        return False
+
+    with open(fp) as f:
+        index_data = json.load(f)
+
+    if action == 'add':
+        in_dic = index_data['selected']
+        out_dic = index_data['orphan']
+    elif action == 'delete':
+        out_dic = index_data['selected']
+        in_dic = index_data['orphan']
+    else:
+        return False
+
+    if kw not in out_dic:
+        return False
+
+    v = out_dic.pop(kw)
+    if kw not in in_dic:
+        in_dic[kw] = []
+
+    in_dic[kw] += v
+
+    with open(fp, 'w') as fout:
+        json.dump(index_data, fout, indent=4, sort_keys=True, ensure_ascii=False)
+
+    return True
+
+
+@app.route('/index', methods = ['GET'])
+def index():
+    if request.method == 'GET':
+        li = list_all(config.index['path'], 'js')
+        li = sorted(li, key=lambda x: int(x.split('.')[0]))
+        return render_template("list_files_all.html", title="INDEX [all]", prefix="/index/", files=li)
+
+
+@app.route('/index/<path:fn>', methods = ['GET', 'POST'])
+def indexfn(fn):
+    if request.method == 'GET':
+        data = read_index(config.index['path'], fn)
+        if data is not None:
+            return render_template("indx.html", fn=fn, selected=data['selected'], orphan=data['orphan'])
+        else:
+            return "File: " + fn + " does not exist."
+    elif request.method == 'POST':
+        data = request.form
+        a = data.get('action')
+        if a == "add":
+            logging.info("POST ADD " + fn + " -- " + data.get('kw') + " ++ " + data.get('list'))
+            if modify_selected_kw_index(config.index['path'], fn, data.get('kw')):
+                return "ok"
+        elif a == "delete":
+            logging.info("POST DELETE " + fn + " -- " + data.get('kw') + " ++ " + data.get('list'))
+            if modify_selected_kw_index(config.index['path'], fn, data.get('kw'), action="delete"):
+                return "ok"
+        return "-"
+
+
+'''
+XML
+'''
+
 def read_xml(d, fn):
     fp = os.path.join(d, fn)
     if not os.path.isfile(fp):
@@ -97,17 +204,12 @@ def delete_nbr_xml(d, fn, nbr, date):
     tr.write(fp)
     return True

-
-@app.route('/')
-def index():
-    return render_template("index.html")
-
-@app.route('/xml', methods = ['GET', 'POST'])
+
+@app.route('/xml', methods = ['GET'])
 def xml():
     if request.method == 'GET':
         li = list_all(config.xml['path'], 'xml')
         li = sorted(li, key=lambda x: int(x.split('.')[0]))
-        return render_template("xml_all.html", files=li)
+        return render_template("list_files_all.html", title="XML [all]", prefix="/xml/", files=li)

 @app.route('/xml/<path:fn>', methods = ['GET', 'POST'])
 def xmlfn(fn):
@@ -129,6 +231,3 @@ def xmlfn(fn):
         if delete_nbr_xml(config.xml['path'], fn, data.get('nbr'), data.get('date')):
             return "ok"
         return "-"
-
-
-
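The POST half of indexfn() defines a small form-encoded protocol: action is "add" or "delete", kw is the keyword to move between the orphan and selected lists, and list names the list it currently sits in. A minimal stdlib client sketch (editor's illustration; the host, port, file name, and keyword are hypothetical):

import urllib.parse, urllib.request

form = urllib.parse.urlencode({'action': 'add', 'kw': 'tactical media', 'list': 'orphan'}).encode()
req = urllib.request.Request('http://localhost:5000/index/3.Network-index.js', data=form)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode())  # "ok" on success, "-" otherwise

This is exactly the request indx.js below issues via $.post when one of the +/- buttons is clicked.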
www/static/indx.js · 10 lines · Normal file
@@ -0,0 +1,10 @@
$(document).ready(function(){
    $('.add, .delete').click(function(e) {
        var li = $(this).parent("li");
        $.post('/index/' + li.data("file"), {'action': $(this).attr('class'), 'kw': li.data("kw"), 'list': li.data("list")}, function(d) {
            if(d === 'ok') {
                location.reload();
            }
        });
    });
});
www/templates/indx.html · 28 lines · Normal file
@@ -0,0 +1,28 @@
<html>
<head>
    <meta charset="utf-8">
    <title>{{fn}}</title>
    <script type="text/javascript" src="{{ url_for('static',filename='jquery-3.2.1.min.js') }}" charset="utf-8"></script>
    <script type="text/javascript" src="{{ url_for('static',filename='indx.js') }}"></script>
</head>
<body>
    <h1>{{fn}}</h1>
    <div id="all">
        <h2>Selected</h2>
        <ul>
        {% for kw, s in selected.items() %}
            <li data-file="{{fn}}" data-kw="{{kw}}" data-list="selected">{{kw}} {% for ss in s %} - {{ss}} {% endfor %}<button class="delete">-</button></li>
        {% endfor %}
        </ul>
        <hr>
        <hr>
        <hr>
        <h2>Orphans</h2>
        <ul>
        {% for kw, s in orphan.items() %}
            <li data-file="{{fn}}" data-kw="{{kw}}" data-list="orphan">{{kw}} {% for ss in s %} - {{ss}} {% endfor %}<button class="add">+</button></li>
        {% endfor %}
        </ul>
    </div>
</body>
</html>
www/templates/list_files_all.html · 14 lines changed
@@ -1,14 +1,14 @@
 <html>
 <head>
 <meta charset="utf-8">
-<title>XML [all]</title>
+<title>{{title}}</title>
 </head>
 <body>
-<h1>XML [all]</h1>
+<h1>{{title}}</h1>
 <div id="all">
 <ul>
 {% for f in files %}
-<li><a href="/xml/{{f}}">{{f}}</a></li>
+<li><a href="{{prefix}}{{f}}">{{f}}</a></li>
 {% endfor %}
 </ul>
 </div>