index(es)

2020-01-21 11:38:31 +01:00
parent cabfe50777
commit afc71795d1
26 changed files with 151318 additions and 13 deletions
@@ -0,0 +1,75 @@
+import argparse, os, sys, json, logging
+from keywords import rake
+from lxml import etree as et
+
+import yake
+
+logging.basicConfig(level=logging.DEBUG)
+
+def _add_ref(indx, kw, nbr_str):
+	"""Register mail reference nbr_str under keyword kw, skipping duplicates."""
+	refs = indx.setdefault(kw, [])
+	if nbr_str not in refs:
+		refs.append(nbr_str)
+
+
+def index(f):
+	"""
+	Builds a keyword index for the mails of one xml file.
+
+	Every <mail> found under 'mails/mail' is run through both the yake and
+	the rake keyword extractors; each keyword is mapped to the list of mails
+	it was found in. A mail is referenced as "<chapter>.<nbr>", where
+	<chapter> is the part of the file name before the first dot.
+
+	@param f Path of the xml file to index (ex: xml/3.Network.xml).
+	@return dict keyword -> list of mail references, or None if f is not a file.
+	"""
+
+	if not os.path.isfile(f):
+		logging.error(f + " is not a valid file.")
+		return None
+
+	# filename should be of the type: N.xxxx.xml
+	#ex: 3.Network.xml
+	ch = os.path.basename(f).split('.')[0]
+
+	indx = {}
+
+	r = rake.Rake('keywords/FoxStoplist.txt', max_words_length=3, min_keyword_frequency=1)
+
+	y = yake.KeywordExtractor(lan="en", top=40)
+
+	root = et.parse(f).getroot()
+
+	for m in root.findall('mails/mail'):
+
+		nbr = m.findtext('nbr')
+		content = m.findtext('content')
+
+		# a mail without number or without text cannot be indexed
+		if not nbr or not content:
+			logging.warning("skipping mail without nbr or content in " + f)
+			continue
+
+		# format nbr
+		nbr_str = ch + '.' + nbr
+
+		# Extractor failures are reported through logging (stderr) and never
+		# through print: stdout carries the JSON index and is redirected to
+		# a file, so anything printed there corrupts the index.
+
+		# yake
+		try:
+			for k in y.extract_keywords(content):
+				_add_ref(indx, k[0], nbr_str)
+		except Exception:
+			logging.exception("yake failed on mail " + nbr_str)
+
+		# rake: keep only the candidates scoring above 4.0
+		try:
+			for k in r.run(content):
+				if k[1] > 4.0:
+					_add_ref(indx, k[0], nbr_str)
+		except Exception:
+			logging.exception("rake failed on mail " + nbr_str)
+
+	return indx
+
+
+
+# Command line entry point: indexes one xml file and prints the index as JSON
+# on stdout. All keywords start as 'orphan'; 'selected' starts empty.
+if __name__ == '__main__':
+
+	p = argparse.ArgumentParser(description='Builds an index of emails')
+	p.add_argument('file', metavar="f", help="xml file to index")
+
+	args = p.parse_args()
+
+	ind = index(args.file)
+	if ind is None:
+		# index() already logged why; exit instead of emitting an index
+		# whose 'orphan' entry is null.
+		sys.exit(1)
+
+	ind_out = {'selected': {}, 'orphan': ind}
+
+	print(json.dumps(ind_out, indent=4, sort_keys=True, ensure_ascii=False))
+
+	
+
@@ -0,0 +1,34 @@
+#!/bin/sh
+
+# Builds the keyword index of every chapter: xml/<name>.xml is indexed by
+# index.py and the resulting JSON is written to index/<name>-index.js.
+for name in \
+	1.Welcome \
+	2.DeepEurope \
+	3.Network \
+	4.net.art \
+	5.Netzkritik \
+	6.FLOSS \
+	7.Critique_Art_Politics \
+	8.TacticalMedia \
+	9.List_talking_to_List \
+	10.Cyberfeminism \
+	11.CODE \
+	13.Post-digital \
+	14.MANIFESTO \
+	15.LutherBlissett \
+	16.NN \
+	17.Interviews
+do
+	echo "${name}.xml -> ${name}-index.js"
+	python index.py "xml/${name}.xml" > "index/${name}-index.js"
+done
@@ -0,0 +1,426 @@
+#From "A stop list for general text" Fox 1989
+a
+about
+above
+across
+after
+again
+against
+all
+almost
+alone
+along
+already
+also
+although
+always
+among
+an
+and
+another
+any
+anybody
+anyone
+anything
+anywhere
+are
+area
+areas
+around
+as
+ask
+asked
+asking
+asks
+at
+away
+b
+back
+backed
+backing
+backs
+be
+because
+became
+become
+becomes
+been
+before
+began
+behind
+being
+beings
+best
+better
+between
+big
+both
+but
+by
+c
+came
+can
+cannot
+case
+cases
+certain
+certainly
+clear
+clearly
+come
+could
+d
+did
+differ
+different
+differently
+do
+does
+done
+down
+downed
+downing
+downs
+during
+e
+each
+early
+either
+end
+ended
+ending
+ends
+enough
+even
+evenly
+ever
+every
+everybody
+everyone
+everything
+everywhere
+f
+face
+faces
+fact
+facts
+far
+felt
+few
+find
+finds
+first
+for
+four
+from
+full
+fully
+further
+furthered
+furthering
+furthers
+g
+gave
+general
+generally
+get
+gets
+give
+given
+gives
+go
+going
+good
+goods
+got
+great
+greater
+greatest
+group
+grouped
+grouping
+groups
+h
+had
+has
+have
+having
+he
+her
+herself
+here
+high
+higher
+highest
+him
+himself
+his
+how
+however
+i
+if
+important
+in
+interest
+interested
+interesting
+interests
+into
+is
+it
+its
+itself
+j
+just
+k
+keep
+keeps
+kind
+knew
+know
+known
+knows
+l
+large
+largely
+last
+later
+latest
+least
+less
+let
+lets
+like
+likely
+long
+longer
+longest
+m
+made
+make
+making
+man
+many
+may
+me
+member
+members
+men
+might
+more
+most
+mostly
+mr
+mrs
+much
+must
+my
+myself
+n
+necessary
+need
+needed
+needing
+needs
+never
+new
+newer
+newest
+next
+no
+non
+not
+nobody
+noone
+nothing
+now
+nowhere
+number
+numbered
+numbering
+numbers
+o
+of
+off
+often
+old
+older
+oldest
+on
+once
+one
+only
+open
+opened
+opening
+opens
+or
+order
+ordered
+ordering
+orders
+other
+others
+our
+out
+over
+p
+part
+parted
+parting
+parts
+per
+perhaps
+place
+places
+point
+pointed
+pointing
+points
+possible
+present
+presented
+presenting
+presents
+problem
+problems
+put
+puts
+q
+quite
+r
+rather
+really
+right
+room
+rooms
+s
+said
+same
+saw
+say
+says
+second
+seconds
+see
+seem
+seemed
+seeming
+seems
+sees
+several
+shall
+she
+should
+show
+showed
+showing
+shows
+side
+sides
+since
+small
+smaller
+smallest
+so
+some
+somebody
+someone
+something
+somewhere
+state
+states
+still
+such
+sure
+t
+take
+taken
+than
+that
+the
+their
+them
+then
+there
+therefore
+these
+they
+thing
+things
+think
+thinks
+this
+those
+though
+thought
+thoughts
+three
+through
+thus
+to
+today
+together
+too
+took
+toward
+turn
+turned
+turning
+turns
+two
+u
+under
+until
+up
+upon
+us
+use
+uses
+used
+v
+very
+w
+want
+wanted
+wanting
+wants
+was
+way
+ways
+we
+well
+wells
+went
+were
+what
+when
+where
+whether
+which
+while
+who
+whole
+whose
+why
+will
+with
+within
+without
+work
+worked
+working
+works
+would
+x
+y
+year
+years
+yet
+you
+young
+younger
+youngest
+your
+yours
+z
@@ -0,0 +1,168 @@
+# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
+# as described in:
+# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
+# Automatic keyword extraction from individual documents.
+# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
+
+import re
+import operator
+
+debug = False
+test = True
+
+
+def is_number(s):
+    """
+    Utility function telling whether a string can be parsed as a number.
+    Strings containing a '.' are parsed as float, all the others as int.
+    @param s The string to check.
+    @return bool True if s parses as a number, False otherwise.
+    """
+    try:
+        if '.' in s:
+            float(s)
+        else:
+            int(s)
+    except ValueError:
+        return False
+    return True
+
+
+def load_stop_words(stop_word_file):
+    """
+    Utility function to load stop words from a file and return as a list of words
+    Lines whose first non-blank character is '#' are treated as comments.
+    @param stop_word_file Path and file name of a file containing stop words.
+    @return list A list of stop words.
+    """
+    stop_words = []
+    # 'with' guarantees the file is closed, even if reading fails
+    with open(stop_word_file) as stop_file:
+        for line in stop_file:
+            if line.strip()[0:1] != "#":
+                # in case more than one per line
+                stop_words.extend(line.split())
+    return stop_words
+
+
+def separate_words(text, min_word_return_size):
+    """
+    Utility function to return a list of all words that have a length greater than a specified number of characters.
+    Words are lowercased; numbers are left out of the result.
+    @param text The text that must be split in to words.
+    @param min_word_return_size The minimum no of characters a word must have to be included.
+    """
+    tokens = (chunk.strip().lower() for chunk in re.split('[^a-zA-Z0-9_\\+\\-/]', text))
+    # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
+    return [token for token in tokens
+            if len(token) > min_word_return_size and token != '' and not is_number(token)]
+
+
+def split_sentences(text):
+    """
+    Utility function to return a list of sentences.
+    The text is cut on punctuation, tabs, quotes, parentheses and on a
+    hyphen surrounded by whitespace.
+    @param text The text that must be split in to sentences.
+    """
+    return re.split(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s', text)
+
+
+def build_stop_word_regex(stop_word_file_path):
+    """
+    Utility function to build the regular expression matching any stop word.
+    @param stop_word_file_path Path and file name of a file containing stop words.
+    @return A compiled, case-insensitive pattern matching every stop word as a whole word.
+    """
+    stop_word_list = load_stop_words(stop_word_file_path)
+    if not stop_word_list:
+        # joining an empty list gives '', a pattern that matches everywhere;
+        # return a pattern that never matches instead
+        return re.compile('(?!)')
+    stop_word_regex_list = []
+    for word in stop_word_list:
+        # stop words are literal text: escape them so that characters such
+        # as '.' or '+' are not read as regex operators
+        word_regex = r'\b' + re.escape(word) + r'(?![\w-])'  # added look ahead for hyphen
+        stop_word_regex_list.append(word_regex)
+    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
+
+
+def generate_candidate_keywords(sentence_list, stopword_pattern):
+    """
+    Utility function to cut sentences in to candidate phrases.
+    Every stop word match acts as a phrase boundary; phrases are lowercased
+    and empty ones are dropped.
+    @param sentence_list The sentences to cut.
+    @param stopword_pattern The compiled pattern matching the stop words.
+    @return list The candidate phrases, in order of appearance.
+    """
+    phrase_list = []
+    for sentence in sentence_list:
+        fragments = re.sub(stopword_pattern, '|', sentence.strip()).split("|")
+        cleaned = (fragment.strip().lower() for fragment in fragments)
+        phrase_list.extend(candidate for candidate in cleaned if candidate != "")
+    return phrase_list
+
+
+def calculate_word_scores(phraseList):
+    """
+    Utility function to compute the score of every word of the phrases.
+    Word score = deg(w) / freq(w), where freq(w) is the number of
+    occurrences of the word and deg(w) adds, to that number, the count of
+    the other words of every phrase the word occurs in.
+    @param phraseList The candidate phrases.
+    @return dict word -> score.
+    """
+    word_frequency = {}
+    word_degree = {}
+    for phrase in phraseList:
+        words = separate_words(phrase, 0)
+        co_occurring = len(words) - 1
+        for word in words:
+            word_frequency[word] = word_frequency.get(word, 0) + 1
+            word_degree[word] = word_degree.get(word, 0) + co_occurring
+
+    word_score = {}
+    for word, frequency in word_frequency.items():
+        degree = word_degree[word] + frequency
+        word_score[word] = degree / (frequency * 1.0)
+    return word_score
+
+
+def generate_candidate_keyword_scores(phrase_list, word_score):
+    """
+    Utility function to score the candidate phrases.
+    The score of a phrase is the sum of the scores of its words.
+    @param phrase_list The candidate phrases.
+    @param word_score dict word -> score, as computed by calculate_word_scores.
+    @return dict phrase -> score.
+    """
+    keyword_candidates = {}
+    for phrase in phrase_list:
+        keyword_candidates[phrase] = sum(word_score[word] for word in separate_words(phrase, 0))
+    return keyword_candidates
+
+
+class Rake(object):
+    """
+    Keyword extractor bound to one stop word list.
+    The stop word pattern is built once, when the extractor is created.
+    """
+
+    def __init__(self, stop_words_path):
+        self.stop_words_path = stop_words_path
+        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
+
+    def run(self, text):
+        """
+        Extracts the keywords of a text.
+        @param text The text to extract keywords from.
+        @return list of (phrase, score) pairs, highest score first.
+        """
+        phrases = generate_candidate_keywords(split_sentences(text), self.__stop_words_pattern)
+        scored = generate_candidate_keyword_scores(phrases, calculate_word_scores(phrases))
+        return sorted(scored.items(), key=lambda pair: pair[1], reverse=True)
+
+
+# if test:
+#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."
+
+#     # Split text into sentences
+#     sentenceList = split_sentences(text)
+#     #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
+#     stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
+#     stopwordpattern = build_stop_word_regex(stoppath)
+
+#     # generate candidate keywords
+#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
+
+#     # calculate individual word scores
+#     wordscores = calculate_word_scores(phraseList)
+
+#     # generate candidate keyword scores
+#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
+#     if debug: print keywordcandidates
+
+#     sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
+#     if debug: print sortedKeywords
+
+#     totalKeywords = len(sortedKeywords)
+#     if debug: print totalKeywords
+#     print sortedKeywords[0:(totalKeywords / 3)]
+
+#     rake = Rake("SmartStoplist.txt")
+#     keywords = rake.run(text)
+#     print keywords
@@ -0,0 +1,301 @@
+# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
+# as described in:
+# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
+# Automatic keyword extraction from individual documents.
+# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
+#
+# NOTE: The original code (from https://github.com/aneesha/RAKE)
+# has been extended by a_medelyan (zelandiya)
+# with a set of heuristics to decide whether a phrase is an acceptable candidate
+# as well as the ability to set frequency and phrase length parameters
+# important when dealing with longer documents
+#
+# NOTE 2: The code published by a_medelyan (https://github.com/zelandiya/RAKE-tutorial)
+# has been additionally extended by Marco Pegoraro to implement the adjoined candidate
+# feature described in section 1.2.3 of the original paper. Note that this creates the
+# need to modify the metric for the candidate score, because the adjoined candidates
+# have a very high score (because of the nature of the original score metric)
+
+from __future__ import absolute_import
+from __future__ import print_function
+import re
+import operator
+# import six
+# from six.moves import range
+from collections import Counter
+
+debug = False
+test = False
+
+
+def is_number(s):
+    """
+    Utility function telling whether a string can be parsed as a number.
+    Strings containing a '.' are parsed as float, all the others as int.
+    @param s The string to check.
+    @return bool True if s parses as a number, False otherwise.
+    """
+    try:
+        if '.' in s:
+            float(s)
+        else:
+            int(s)
+    except ValueError:
+        return False
+    return True
+
+
+def load_stop_words(stop_word_file):
+    """
+    Utility function to load stop words from a file and return as a list of words
+    Lines whose first non-blank character is '#' are treated as comments.
+    @param stop_word_file Path and file name of a file containing stop words.
+    @return list A list of stop words.
+    """
+    stop_words = []
+    # 'with' guarantees the file is closed, even if reading fails
+    with open(stop_word_file) as stop_file:
+        for line in stop_file:
+            if line.strip()[0:1] != "#":
+                # in case more than one per line
+                stop_words.extend(line.split())
+    return stop_words
+
+
+def separate_words(text, min_word_return_size):
+    """
+    Utility function to return a list of all words that have a length greater than a specified number of characters.
+    Words are lowercased; numbers are left out of the result.
+    @param text The text that must be split in to words.
+    @param min_word_return_size The minimum no of characters a word must have to be included.
+    """
+    tokens = (chunk.strip().lower() for chunk in re.split('[^a-zA-Z0-9_\\+\\-/]', text))
+    # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
+    return [token for token in tokens
+            if len(token) > min_word_return_size and token != '' and not is_number(token)]
+
+
+def split_sentences(text):
+    """
+    Utility function to return a list of sentences.
+    The text is cut on punctuation, newlines, tabs, hyphens, quotes,
+    parentheses and square brackets.
+    @param text The text that must be split in to sentences.
+    """
+    return re.split(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]', text)
+
+
+def build_stop_word_regex(stop_word_list):
+    """
+    Utility function to build the regular expression matching any stop word.
+    @param stop_word_list The list of stop words.
+    @return A compiled, case-insensitive pattern matching every stop word as a whole word.
+    """
+    if not stop_word_list:
+        # joining an empty list gives '', a pattern that matches everywhere;
+        # return a pattern that never matches instead
+        return re.compile('(?!)')
+    stop_word_regex_list = []
+    for word in stop_word_list:
+        # stop words are literal text: escape them so that characters such
+        # as '.' or '+' are not read as regex operators
+        word_regex = '\\b' + re.escape(word) + '\\b'
+        stop_word_regex_list.append(word_regex)
+    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
+
+
+#
+# Function that extracts the adjoined candidates from a list of sentences and filters them by frequency
+#
+def extract_adjoined_candidates(sentence_list, stoplist, min_keywords, max_keywords, min_freq):
+    """
+    Collects the adjoined candidates of all the sentences, then keeps only
+    those appearing at least min_freq times.
+    @param sentence_list The sentences to scan.
+    @param stoplist The stop words.
+    @param min_keywords The minimum number of non-stop words in a candidate.
+    @param max_keywords The maximum number of non-stop words in a candidate.
+    @param min_freq The minimum frequency a candidate must have to be kept.
+    @return list The filtered adjoined candidates.
+    """
+    gathered = []
+    for sentence in sentence_list:
+        gathered.extend(adjoined_candidates_from_sentence(sentence, stoplist, min_keywords, max_keywords))
+    return filter_adjoined_candidates(gathered, min_freq)
+
+
+# return adjoined_candidates
+
+#
+# Function that extracts the adjoined candidates from a single sentence
+#
+def adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords):
+    """
+    Extracts the adjoined candidates (phrases that contain stop words in
+    their interior) from a single sentence.
+    @param s The sentence to scan.
+    @param stoplist The stop words.
+    @param min_keywords The minimum number of non-stop words in a candidate.
+    @param max_keywords The maximum number of non-stop words in a candidate.
+    @return list The candidates, as strings of space separated lowercase words.
+    """
+    # Membership is tested for every word of every window: use a set, so
+    # that each test does not scan the whole stop list
+    stop_set = set(stoplist)
+    candidates = []
+    # Splits the sentence to get a list of lowercase words
+    sl = s.lower().split()
+    # For each possible length of the adjoined candidate
+    for num_keywords in range(min_keywords, max_keywords + 1):
+        # A candidate holds num_keywords words plus at least one stop word,
+        # so it cannot start in the last num_keywords positions
+        for i in range(0, len(sl) - num_keywords):
+            # Position i marks the first word of the candidate. Proceeds only if it's not a stopword
+            if sl[i] in stop_set:
+                continue
+            candidate_words = [sl[i]]
+            # Counts the non-stopword words in the candidate
+            keyword_counter = 1
+            contains_stopword = False
+            # j points to the next word to add
+            j = 1
+            # Until the word count reaches the wanted number of keywords or the end is reached
+            while keyword_counter < num_keywords and i + j < len(sl):
+                word = sl[i + j]
+                candidate_words.append(word)
+                if word in stop_set:
+                    contains_stopword = True
+                else:
+                    keyword_counter += 1
+                j += 1
+            # Adds the candidate to the list only if:
+            # 1) it contains at least a stopword (if it doesn't it's already been considered)
+            # AND
+            # 2) the last word is not a stopword
+            # AND
+            # 3) it contains exactly the correct number of keywords (to avoid doubles)
+            if contains_stopword and candidate_words[-1] not in stop_set and keyword_counter == num_keywords:
+                candidates.append(' '.join(candidate_words))
+    return candidates
+
+
+#
+# Function that filters the adjoined candidates to keep only those that appears with a certain frequency
+#
+def filter_adjoined_candidates(candidates, min_freq):
+    """
+    Keeps only the adjoined candidates that appear with a certain frequency.
+    Every occurrence of a kept candidate stays in the result.
+    @param candidates The adjoined candidates, with repetitions.
+    @param min_freq The minimum number of occurrences a candidate must have.
+    @return list The candidates occurring at least min_freq times.
+    """
+    # the key is the candidate and the value is the frequency of the candidate
+    occurrences = Counter(candidates)
+    return [candidate for candidate in candidates if occurrences[candidate] >= min_freq]
+
+
+def generate_candidate_keywords(sentence_list, stopword_pattern, stop_word_list, min_char_length=1, max_words_length=5,
+                                min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2):
+    """
+    Cuts the sentences in to candidate phrases at every stop word, keeps
+    the acceptable ones, then appends the adjoined candidates.
+    @param sentence_list The sentences to cut.
+    @param stopword_pattern The compiled pattern matching the stop words.
+    @param stop_word_list The stop words, used for the adjoined candidates.
+    @param min_char_length The minimum number of characters of a phrase.
+    @param max_words_length The maximum number of words of a phrase.
+    @param min_words_length_adj The minimum number of keywords of an adjoined candidate.
+    @param max_words_length_adj The maximum number of keywords of an adjoined candidate.
+    @param min_phrase_freq_adj The minimum frequency of an adjoined candidate.
+    @return list The candidate phrases.
+    """
+    phrase_list = []
+    for sentence in sentence_list:
+        for fragment in re.sub(stopword_pattern, '|', sentence.strip()).split("|"):
+            candidate = fragment.strip().lower()
+            if candidate == "" or not is_acceptable(candidate, min_char_length, max_words_length):
+                continue
+            phrase_list.append(candidate)
+    adjoined = extract_adjoined_candidates(sentence_list, stop_word_list, min_words_length_adj,
+                                           max_words_length_adj, min_phrase_freq_adj)
+    return phrase_list + adjoined
+
+
+def is_acceptable(phrase, min_char_length, max_words_length):
+    """
+    Tells whether a phrase can be a candidate keyword.
+    @param phrase The phrase to check.
+    @param min_char_length The minimum number of characters of the phrase.
+    @param max_words_length The maximum number of words of the phrase.
+    @return int 1 if the phrase is acceptable, 0 otherwise.
+    """
+    # a phrase must have a min length in characters
+    if len(phrase) < min_char_length:
+        return 0
+
+    # a phrase must have a max number of words
+    if len(phrase.split()) > max_words_length:
+        return 0
+
+    digits = sum(1 for char in phrase if char.isdigit())
+    alpha = sum(1 for char in phrase if not char.isdigit() and char.isalpha())
+
+    # a phrase must have at least one alpha character,
+    # and must not have more digits than alpha characters
+    if alpha == 0 or digits > alpha:
+        return 0
+    return 1
+
+
+def calculate_word_scores(phraseList):
+    """
+    Utility function to compute the score of every word of the phrases.
+    Word score = deg(w) / freq(w), where freq(w) is the number of
+    occurrences of the word and deg(w) adds, to that number, the count of
+    the other words of every phrase the word occurs in.
+    @param phraseList The candidate phrases.
+    @return dict word -> score.
+    """
+    word_frequency = {}
+    word_degree = {}
+    for phrase in phraseList:
+        words = separate_words(phrase, 0)
+        co_occurring = len(words) - 1
+        for word in words:
+            word_frequency[word] = word_frequency.get(word, 0) + 1
+            word_degree[word] = word_degree.get(word, 0) + co_occurring
+
+    word_score = {}
+    for word, frequency in word_frequency.items():
+        degree = word_degree[word] + frequency
+        word_score[word] = degree / (frequency * 1.0)
+    return word_score
+
+
+def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
+    """
+    Utility function to score the candidate phrases.
+    The score of a phrase is the sum of the scores of its words.
+    @param phrase_list The candidate phrases, with repetitions.
+    @param word_score dict word -> score, as computed by calculate_word_scores.
+    @param min_keyword_frequency The minimum number of occurrences a phrase must have to be scored.
+    @return dict phrase -> score.
+    """
+    # Count every phrase once up front: calling phrase_list.count() inside
+    # the loop rescans the whole list for each phrase
+    phrase_counts = Counter(phrase_list) if min_keyword_frequency > 1 else None
+    keyword_candidates = {}
+    for phrase in phrase_list:
+        if phrase in keyword_candidates:
+            # a repeated phrase always gets the same score
+            continue
+        if phrase_counts is not None and phrase_counts[phrase] < min_keyword_frequency:
+            continue
+        word_list = separate_words(phrase, 0)
+        keyword_candidates[phrase] = sum(word_score[word] for word in word_list)
+    return keyword_candidates
+
+
+class Rake(object):
+    """
+    Keyword extractor bound to one stop word list and one set of parameters.
+    @param stop_words_path Path and file name of a file containing stop words.
+    @param min_char_length The minimum number of characters of a phrase.
+    @param max_words_length The maximum number of words of a phrase.
+    @param min_keyword_frequency The minimum number of occurrences of a phrase.
+    @param min_words_length_adj The minimum number of keywords of an adjoined candidate.
+    @param max_words_length_adj The maximum number of keywords of an adjoined candidate.
+    @param min_phrase_freq_adj The minimum frequency of an adjoined candidate.
+    """
+
+    def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1,
+                 min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2):
+        self.__stop_words_path = stop_words_path
+        self.__stop_words_list = load_stop_words(stop_words_path)
+        # The pattern only depends on the stop list: build it once here
+        # rather than on every call of run()
+        self.__stop_words_pattern = build_stop_word_regex(self.__stop_words_list)
+        self.__min_char_length = min_char_length
+        self.__max_words_length = max_words_length
+        self.__min_keyword_frequency = min_keyword_frequency
+        self.__min_words_length_adj = min_words_length_adj
+        self.__max_words_length_adj = max_words_length_adj
+        self.__min_phrase_freq_adj = min_phrase_freq_adj
+
+    def run(self, text):
+        """
+        Extracts the keywords of a text.
+        @param text The text to extract keywords from.
+        @return list of (phrase, score) pairs, highest score first.
+        """
+        sentence_list = split_sentences(text)
+
+        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern, self.__stop_words_list,
+                                                  self.__min_char_length, self.__max_words_length,
+                                                  self.__min_words_length_adj, self.__max_words_length_adj,
+                                                  self.__min_phrase_freq_adj)
+
+        word_scores = calculate_word_scores(phrase_list)
+
+        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)
+
+        sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)
+        return sorted_keywords
+
+
+# if test and __name__ == '__main__':
+#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."
+
+#     # Split text into sentences
+#     sentenceList = split_sentences(text)
+#     # stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
+#     stoppath = "data/stoplists/SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
+#     stopwordpattern = build_stop_word_regex(stoppath)
+
+#     # generate candidate keywords
+#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern, load_stop_words(stoppath))
+
+#     # calculate individual word scores
+#     wordscores = calculate_word_scores(phraseList)
+
+#     # generate candidate keyword scores
+#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
+#     if debug: print(keywordcandidates)
+
+#     sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
+#     if debug: print(sortedKeywords)
+
+#     totalKeywords = len(sortedKeywords)
+#     if debug: print(totalKeywords)
+#     print(sortedKeywords[0:(totalKeywords // 3)])
+
+#     rake = Rake("data/stoplists/SmartStoplist.txt")
+#     keywords = rake.run(text)
+#     print(keywords)
@@ -3,6 +3,7 @@ from www import app
 import json, logging, os, glob
 from lxml import etree as et
 import config
+from collections import OrderedDict

 def list_all(d, ext):

@@ -12,6 +13,112 @@ def list_all(d, ext):

 	return [os.path.basename(f) for f in glob.glob(os.path.join(d, "*." + ext))]

+# Root page of the application: renders the "index.html" template.
+@app.route('/')
+def top():
+	return render_template("index.html")
+
+'''
+		INDEX
+'''
+
+def read_index(d, fn):
+	"""
+	Reads the JSON index file fn of directory d.
+
+	Keys are returned in file order (OrderedDict).
+	Returns None when the file does not exist or lies outside of d.
+	"""
+	# fn comes from the request url (route /index/<path:fn>): resolve the
+	# path and refuse anything that escapes d, such as "../../other.json"
+	root = os.path.realpath(d)
+	fp = os.path.realpath(os.path.join(d, fn))
+	if not fp.startswith(root + os.sep):
+		return None
+	if not os.path.isfile(fp):
+		return None
+
+	with open(fp) as f:
+		index_data = json.load(f, object_pairs_hook=OrderedDict)
+
+	return index_data
+
+# def add_selected_kw_index(d, fn, kw):
+# 	fp = os.path.join(d, fn)
+# 	if not os.path.isfile(fp):
+# 		return False
+
+# 	with open(fp) as f:
+# 		index_data = json.load(f)
+
+# 	if kw not in index_data['orphan']:
+# 		return False
+
+# 	v = index_data['orphan'].pop(kw)
+# 	if kw not in index_data['selected']:
+# 		index_data['selected'][kw] = []
+
+# 	index_data['selected'][kw] += v
+
+# 	with open(fp, 'w') as fout:
+# 		json.dump(index_data, fout, indent=4, sort_keys=True, ensure_ascii=False)
+
+# 	return True
+
+
+def modify_selected_kw_index(d, fn, kw, action="add"):
+	"""
+	Moves keyword kw between the 'orphan' and 'selected' parts of the JSON
+	index file fn of directory d, then rewrites the file.
+
+	action "add" moves kw from 'orphan' to 'selected', "delete" moves it
+	back. The mail references of kw are appended to those already present
+	on the receiving side.
+
+	Returns True when the file was updated. Returns False when the file is
+	missing or outside of d, the action is unknown, or kw is not on the
+	side it should be taken from.
+	"""
+	# fn comes from the request url (route /index/<path:fn>): resolve the
+	# path and refuse anything that escapes d, so that a request cannot
+	# rewrite a file located elsewhere
+	root = os.path.realpath(d)
+	fp = os.path.realpath(os.path.join(d, fn))
+	if not fp.startswith(root + os.sep):
+		return False
+	if not os.path.isfile(fp):
+		return False
+
+	with open(fp) as f:
+		index_data = json.load(f)
+
+	if action == 'add':
+		in_dic = index_data['selected']
+		out_dic = index_data['orphan']
+	elif action == 'delete':
+		out_dic = index_data['selected']
+		in_dic = index_data['orphan']
+	else:
+		return False
+
+	if kw not in out_dic:
+		return False
+
+	v = out_dic.pop(kw)
+	in_dic.setdefault(kw, [])
+	in_dic[kw] += v
+
+	with open(fp, 'w') as fout:
+		json.dump(index_data, fout, indent=4, sort_keys=True, ensure_ascii=False)
+
+	return True
+
+
+# Lists all the index files (*.js) of the index directory, ordered by the
+# number their name starts with.
+@app.route('/index', methods = ['GET'])
+def index():
+	if request.method != 'GET':
+		return None
+	files = sorted(list_all(config.index['path'], 'js'), key=lambda name: int(name.split('.')[0]))
+	return render_template("list_files_all.html", title="INDEX [all]", prefix="/index/", files=files)
+
+# GET shows the selected and orphan keywords of the index file fn.
+# POST moves one keyword between the two lists: form field 'action' is
+# "add" or "delete" and 'kw' is the keyword. The answer is "ok" when the
+# file was updated and "-" otherwise.
+@app.route('/index/<path:fn>', methods = ['GET', 'POST'])
+def indexfn(fn):
+	if request.method == 'GET':
+		data = read_index(config.index['path'], fn)
+		if data is not None:
+			return render_template("indx.html", fn=fn, selected=data['selected'], orphan=data['orphan'])
+		# NOTE(review): fn comes from the url and is echoed unescaped in
+		# the response body -- consider escaping it.
+		return "File: " + fn + " does not exist."
+	elif request.method == 'POST':
+		data = request.form
+		a = data.get('action')
+		if a in ("add", "delete"):
+			kw = data.get('kw')
+			# %-style arguments: a missing form field is logged as None
+			# instead of raising TypeError on string concatenation
+			logging.info("POST %s %s -- %s ++ %s", a.upper(), fn, kw, data.get('list'))
+			if modify_selected_kw_index(config.index['path'], fn, kw, action=a):
+				return "ok"
+		return "-"
+
+
+'''
+		XML
+'''
+
 def read_xml(d, fn):
 	fp = os.path.join(d, fn)
 	if not os.path.isfile(fp):
@@ -97,17 +204,12 @@ def delete_nbr_xml(d, fn, nbr, date):
 	tr.write(fp)
 	return True

-
-@app.route('/')
-def index():
-	return render_template("index.html")
-
-@app.route('/xml', methods = ['GET', 'POST'])
+@app.route('/xml', methods = ['GET'])
 def xml():
 	if request.method == 'GET':
 		li = list_all(config.xml['path'], 'xml')
 		li = sorted(li, key=lambda x: int(x.split('.')[0]))
-		return render_template("xml_all.html", files=li)
+		return render_template("list_files_all.html", title="XML [all]", prefix="/xml/", files=li)

@app.route('/xml/<path:fn>', methods = ['GET', 'POST'])
 def xmlfn(fn):
@@ -129,6 +231,3 @@ def xmlfn(fn):
 			if delete_nbr_xml(config.xml['path'], fn, data.get('nbr'), data.get('date')):
 				return "ok"
 		return "-"
-
-		
-
@@ -0,0 +1,10 @@
+// The "+" / "-" buttons post the keyword of their list item to the index
+// file url; the page is reloaded when the server answers 'ok'.
+$(function() {
+	$('.add, .delete').on('click', function() {
+		var button = $(this);
+		var item = button.parent("li");
+		var payload = {
+			'action': button.attr('class'),
+			'kw': item.data("kw"),
+			'list': item.data("list")
+		};
+		$.post('/index/' + item.data("file"), payload, function(reply) {
+			if (reply !== 'ok') {
+				return;
+			}
+			location.reload();
+		});
+	});
+});
@@ -0,0 +1,28 @@
+<html>
+<head>
+	<meta charset="utf-8">
+	<title>{{fn}}</title>
+   <script type="text/javascript" src="{{ url_for('static',filename='jquery-3.2.1.min.js') }}" charset="utf-8"></script>	
+	<script type="text/javascript" src="{{ url_for('static',filename='indx.js') }}"></script>
+</head>
+<body>
+<h1>{{fn}}</h1>
+<div id="all">
+   <h2>Selected</h2>
+   <ul>
+   {% for kw, s in selected.items() %}
+   <li data-file="{{fn}}" data-kw="{{kw}}" data-list="selected">{{kw}} {% for ss in s %} - {{ss}} {% endfor %}<button class="delete">-</button></li>
+   {% endfor %}	
+   </ul>
+   <hr>
+   <hr>
+   <hr>
+   <h2>Orphans</h2>
+   <ul>
+   {% for kw, s in orphan.items() %}
+   <li data-file="{{fn}}" data-kw="{{kw}}" data-list="orphan">{{kw}} {% for ss in s %} - {{ss}} {% endfor %}<button class="add">+</button></li>
+   {% endfor %}   
+   </ul>   
+</div>
+</body>
+</html>
@@ -1,14 +1,14 @@
 <html>
 <head>
 	<meta charset="utf-8">
-	<title>XML [all]</title>
+	<title>{{title}}</title>
 </head>
 <body>
-<h1>XML [all]</h1>
+<h1>{{title}}</h1>
 <div id="all">
   <ul>
   {% for f in files %}
-   <li><a href="/xml/{{f}}">{{f}}</a></li>
+   <li><a href="{{prefix}}{{f}}">{{f}}</a></li>
   {% endfor %}	
   </ul>
 </div>