Full_digest_rescheduled/keywords/oldrake.py

# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). 
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.

import re
import operator

# Module-level switches. Within this file they are referenced only by the
# commented-out demo block at the bottom: `test` guards the demo as a whole
# and `debug` guards its intermediate print statements.
debug = False
test = True


def is_number(s):
    """
    Utility function to tell whether a string can be parsed as a number.
    Strings containing a '.' are parsed with float(), all others with int().
    @param s The string to check.
    @return bool True if the chosen parser accepts the string, False if it raises ValueError.
    """
    # Pick the parser up front; only a ValueError from it means "not a number".
    parse = float if '.' in s else int
    try:
        parse(s)
    except ValueError:
        return False
    return True


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words.
    Lines whose first non-blank character is "#" are treated as comments and skipped;
    every other line is split on whitespace, so a line may hold several words.
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words, in file order (duplicates are kept).
    """
    stop_words = []
    # The context manager closes the file handle even if reading fails part-way.
    with open(stop_word_file) as stop_word_handle:
        for line in stop_word_handle:
            if line.strip()[0:1] != "#":
                stop_words.extend(line.split())  # in case more than one per line
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to split text into lower-cased words and keep only those that are
    longer than a specified number of characters and are not numbers.
    @param text The text that must be split in to words.
    @param min_word_return_size A word is kept only if it has more characters than this.
    @return list The surviving words, in the order they occur in the text.
    """
    # Anything that is not a letter, digit, '_', '+', '-' or '/' separates words.
    pieces = re.split('[^a-zA-Z0-9_\\+\\-/]', text)
    normalized = (piece.strip().lower() for piece in pieces)
    # Numbers stay in the phrase text but are not counted as words, since they
    # tend to invalidate the scores of their phrases.
    return [
        candidate
        for candidate in normalized
        if len(candidate) > min_word_return_size
        and candidate != ''
        and not is_number(candidate)
    ]


def split_sentences(text):
    """
    Utility function to cut text into sentence-like fragments.
    The text is split at punctuation characters (see the pattern below) and at a
    hyphen that has whitespace on both sides; delimiters are dropped and empty
    fragments are kept.
    @param text The text that must be split in to sentences.
    @return list The fragments between delimiters, in order.
    """
    delimiter_pattern = u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s'
    return re.split(delimiter_pattern, text)


def build_stop_word_regex(stop_word_file_path):
    """
    Utility function to build a single case-insensitive regex matching any stop word
    listed in the given file.
    Each stop word must start at a word boundary and must not be followed by a word
    character or a hyphen.
    @param stop_word_file_path Path and file name of a file containing stop words.
    @return A compiled regex pattern; it never matches when the file lists no stop words.
    """
    stop_word_list = load_stop_words(stop_word_file_path)
    if not stop_word_list:
        # Joining an empty list would compile '', which matches at every position
        # and would make callers split text between every character. Use a pattern
        # that can never match instead.
        return re.compile(r'(?!)', re.IGNORECASE)
    stop_word_regex_list = [
        # re.escape keeps stop words containing regex metacharacters literal;
        # the look ahead stops a match right before a hyphen or word character.
        r'\b' + re.escape(word) + r'(?![\w-])'
        for word in stop_word_list
    ]
    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)


def generate_candidate_keywords(sentence_list, stopword_pattern):
    """
    Utility function to cut each sentence into candidate phrases at stop words.
    @param sentence_list The sentences (text fragments) to process.
    @param stopword_pattern Regex matching the stop words that delimit phrases.
    @return list The non-empty phrases, stripped and lower-cased, in order of occurrence.
    """
    candidates = []
    for sentence in sentence_list:
        # Every stop word becomes a '|' marker, then the sentence is cut at the markers.
        marked = re.sub(stopword_pattern, '|', sentence.strip())
        fragments = (piece.strip().lower() for piece in marked.split("|"))
        candidates.extend(fragment for fragment in fragments if fragment != "")
    return candidates


def calculate_word_scores(phraseList):
    """
    Utility function to score every word found in the candidate phrases as deg(w)/freq(w).
    freq(w) is the number of occurrences of the word across all phrases; deg(w) adds,
    for each occurrence, the number of other words in that phrase, plus freq(w) itself.
    @param phraseList The candidate phrases.
    @return dict Maps each word to its score as a float.
    """
    frequency = {}
    co_occurrence = {}
    for phrase in phraseList:
        words = separate_words(phrase, 0)
        others_in_phrase = len(words) - 1
        for word in words:
            frequency[word] = frequency.get(word, 0) + 1
            co_occurrence[word] = co_occurrence.get(word, 0) + others_in_phrase

    # Word score = deg(w) / freq(w), where deg(w) = co-occurrences + freq(w).
    return {
        word: (co_occurrence[word] + count) / (count * 1.0)
        for word, count in frequency.items()
    }


def generate_candidate_keyword_scores(phrase_list, word_score):
    """
    Utility function to score each candidate phrase as the sum of the scores of its words.
    @param phrase_list The candidate phrases; a repeated phrase yields a single entry.
    @param word_score dict mapping each word to its score.
    @return dict Maps each distinct phrase to its summed score.
    """
    return {
        phrase: sum(word_score[word] for word in separate_words(phrase, 0))
        for phrase in phrase_list
    }


class Rake(object):
    """
    Keyword extractor that runs the RAKE steps defined in this module against a
    stop word pattern built once at construction time.
    """

    def __init__(self, stop_words_path):
        """
        @param stop_words_path Path and file name of a file containing stop words.
        """
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        """
        Extract candidate keywords from the text and rank them.
        @param text The text to extract keywords from.
        @return list (phrase, score) tuples sorted by score, highest first.
        """
        phrases = generate_candidate_keywords(split_sentences(text), self.__stop_words_pattern)
        phrase_scores = generate_candidate_keyword_scores(phrases, calculate_word_scores(phrases))
        return sorted(phrase_scores.items(), key=operator.itemgetter(1), reverse=True)


# if test:
#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

#     # Split text into sentences
#     sentenceList = split_sentences(text)
#     #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
#     stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
#     stopwordpattern = build_stop_word_regex(stoppath)

#     # generate candidate keywords
#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

#     # calculate individual word scores
#     wordscores = calculate_word_scores(phraseList)

#     # generate candidate keyword scores
#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
#     if debug: print keywordcandidates

#     sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
#     if debug: print sortedKeywords

#     totalKeywords = len(sortedKeywords)
#     if debug: print totalKeywords
#     print sortedKeywords[0:(totalKeywords / 3)]

#     rake = Rake("SmartStoplist.txt")
#     keywords = rake.run(text)
#     print keywords
index(es) 2020-01-21 11:38:31 +01:00			`# Implementation of RAKE - Rapid Automtic Keyword Exraction algorithm`
			`# as described in:`
			`# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).`
			`# Automatic keyword extraction from indi-vidual documents.`
			`# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.`

			`import re`
			`import operator`

			`debug = False`
			`test = True`


			`def is_number(s):`
			`try:`
			`float(s) if '.' in s else int(s)`
			`return True`
			`except ValueError:`
			`return False`


			`def load_stop_words(stop_word_file):`
			`"""`
			`Utility function to load stop words from a file and return as a list of words`
			`@param stop_word_file Path and file name of a file containing stop words.`
			`@return list A list of stop words.`
			`"""`
			`stop_words = []`
			`for line in open(stop_word_file):`
			`if line.strip()[0:1] != "#":`
			`for word in line.split(): # in case more than one per line`
			`stop_words.append(word)`
			`return stop_words`


			`def separate_words(text, min_word_return_size):`
			`"""`
			`Utility function to return a list of all words that are have a length greater than a specified number of characters.`
			`@param text The text that must be split in to words.`
			`@param min_word_return_size The minimum no of characters a word must have to be included.`
			`"""`
			`splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')`
			`words = []`
			`for single_word in splitter.split(text):`
			`current_word = single_word.strip().lower()`
			`#leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases`
			`if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):`
			`words.append(current_word)`
			`return words`


			`def split_sentences(text):`
			`"""`
			`Utility function to return a list of sentences.`
			`@param text The text that must be split in to sentences.`
			`"""`
			`sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]\|\\s\\-\\s')`
			`sentences = sentence_delimiters.split(text)`
			`return sentences`


			`def build_stop_word_regex(stop_word_file_path):`
			`stop_word_list = load_stop_words(stop_word_file_path)`
			`stop_word_regex_list = []`
			`for word in stop_word_list:`
			`word_regex = r'\b' + word + r'(?![\w-])' # added look ahead for hyphen`
			`stop_word_regex_list.append(word_regex)`
			`stop_word_pattern = re.compile('\|'.join(stop_word_regex_list), re.IGNORECASE)`
			`return stop_word_pattern`


			`def generate_candidate_keywords(sentence_list, stopword_pattern):`
			`phrase_list = []`
			`for s in sentence_list:`
			`tmp = re.sub(stopword_pattern, '\|', s.strip())`
			`phrases = tmp.split("\|")`
			`for phrase in phrases:`
			`phrase = phrase.strip().lower()`
			`if phrase != "":`
			`phrase_list.append(phrase)`
			`return phrase_list`


			`def calculate_word_scores(phraseList):`
			`word_frequency = {}`
			`word_degree = {}`
			`for phrase in phraseList:`
			`word_list = separate_words(phrase, 0)`
			`word_list_length = len(word_list)`
			`word_list_degree = word_list_length - 1`
			`#if word_list_degree > 3: word_list_degree = 3 #exp.`
			`for word in word_list:`
			`word_frequency.setdefault(word, 0)`
			`word_frequency[word] += 1`
			`word_degree.setdefault(word, 0)`
			`word_degree[word] += word_list_degree #orig.`
			`#word_degree[word] += 1/(word_list_length*1.0) #exp.`
			`for item in word_frequency:`
			`word_degree[item] = word_degree[item] + word_frequency[item]`

			`# Calculate Word scores = deg(w)/frew(w)`
			`word_score = {}`
			`for item in word_frequency:`
			`word_score.setdefault(item, 0)`
			`word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig.`
			`#word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.`
			`return word_score`


			`def generate_candidate_keyword_scores(phrase_list, word_score):`
			`keyword_candidates = {}`
			`for phrase in phrase_list:`
			`keyword_candidates.setdefault(phrase, 0)`
			`word_list = separate_words(phrase, 0)`
			`candidate_score = 0`
			`for word in word_list:`
			`candidate_score += word_score[word]`
			`keyword_candidates[phrase] = candidate_score`
			`return keyword_candidates`


			`class Rake(object):`
			`def __init__(self, stop_words_path):`
			`self.stop_words_path = stop_words_path`
			`self.__stop_words_pattern = build_stop_word_regex(stop_words_path)`

			`def run(self, text):`
			`sentence_list = split_sentences(text)`

			`phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)`

			`word_scores = calculate_word_scores(phrase_list)`

			`keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)`

			`sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)`
			`return sorted_keywords`


			`# if test:`
			# text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

			`# # Split text into sentences`
			`# sentenceList = split_sentences(text)`
			`# #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1`
			`# stoppath = "SmartStoplist.txt" #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1`
			`# stopwordpattern = build_stop_word_regex(stoppath)`

			`# # generate candidate keywords`
			`# phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)`

			`# # calculate individual word scores`
			`# wordscores = calculate_word_scores(phraseList)`

			`# # generate candidate keyword scores`
			`# keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)`
			`# if debug: print keywordcandidates`

			`# sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)`
			`# if debug: print sortedKeywords`

			`# totalKeywords = len(sortedKeywords)`
			`# if debug: print totalKeywords`
			`# print sortedKeywords[0:(totalKeywords / 3)]`

			`# rake = Rake("SmartStoplist.txt")`
			`# keywords = rake.run(text)`
			`# print keywords`