# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm # as described in: # Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). # Automatic keyword extraction from individual documents. # In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd. # # NOTE: The original code (from https://github.com/aneesha/RAKE) # has been extended by a_medelyan (zelandiya) # with a set of heuristics to decide whether a phrase is an acceptable candidate # as well as the ability to set frequency and phrase length parameters # important when dealing with longer documents # # NOTE 2: The code published by a_medelyan (https://github.com/zelandiya/RAKE-tutorial) # has been additionally extended by Marco Pegoraro to implement the adjoined candidate # feature described in section 1.2.3 of the original paper. Note that this creates the # need to modify the metric for the candidate score, because the adjoined candidates # have a very high score (because of the nature of the original score metric) from __future__ import absolute_import from __future__ import print_function import re import operator # import six # from six.moves import range from collections import Counter debug = False test = False def is_number(s): try: float(s) if '.' in s else int(s) return True except ValueError: return False def load_stop_words(stop_word_file): """ Utility function to load stop words from a file and return as a list of words @param stop_word_file Path and file name of a file containing stop words. @return list A list of stop words. """ stop_words = [] for line in open(stop_word_file): if line.strip()[0:1] != "#": for word in line.split(): # in case more than one per line stop_words.append(word) return stop_words def separate_words(text, min_word_return_size): """ Utility function to return a list of all words that are have a length greater than a specified number of characters. @param text The text that must be split in to words. @param min_word_return_size The minimum no of characters a word must have to be included. """ splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]') words = [] for single_word in splitter.split(text): current_word = single_word.strip().lower() # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word): words.append(current_word) return words def split_sentences(text): """ Utility function to return a list of sentences. @param text The text that must be split in to sentences. """ sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]') sentences = sentence_delimiters.split(text) return sentences def build_stop_word_regex(stop_word_list): stop_word_regex_list = [] for word in stop_word_list: word_regex = '\\b' + word + '\\b' stop_word_regex_list.append(word_regex) stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE) return stop_word_pattern # # Function that extracts the adjoined candidates from a list of sentences and filters them by frequency # def extract_adjoined_candidates(sentence_list, stoplist, min_keywords, max_keywords, min_freq): adjoined_candidates = [] for s in sentence_list: # Extracts the candidates from each single sentence and adds them to the list adjoined_candidates += adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords) # Filters the candidates and returns them return filter_adjoined_candidates(adjoined_candidates, min_freq) # return adjoined_candidates # # Function that extracts the adjoined candidates from a single sentence # def adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords): # Initializes the candidate list to empty candidates = [] # Splits the sentence to get a list of lowercase words sl = s.lower().split() # For each possible length of the adjoined candidate for num_keywords in range(min_keywords, max_keywords + 1): # Until the third-last word for i in range(0, len(sl) - num_keywords): # Position i marks the first word of the candidate. Proceeds only if it's not a stopword if sl[i] not in stoplist: candidate = sl[i] # Initializes j (the pointer to the next word) to 1 j = 1 # Initializes the word counter. This counts the non-stopwords words in the candidate keyword_counter = 1 contains_stopword = False # Until the word count reaches the maximum number of keywords or the end is reached while keyword_counter < num_keywords and i + j < len(sl): # Adds the next word to the candidate candidate = candidate + ' ' + sl[i + j] # If it's not a stopword, increase the word counter. If it is, turn on the flag if sl[i + j] not in stoplist: keyword_counter += 1 else: contains_stopword = True # Next position j += 1 # Adds the candidate to the list only if: # 1) it contains at least a stopword (if it doesn't it's already been considered) # AND # 2) the last word is not a stopword # AND # 3) the adjoined candidate keyphrase contains exactly the correct number of keywords (to avoid doubles) if contains_stopword and candidate.split()[-1] not in stoplist and keyword_counter == num_keywords: candidates.append(candidate) return candidates # # Function that filters the adjoined candidates to keep only those that appears with a certain frequency # def filter_adjoined_candidates(candidates, min_freq): # Creates a dictionary where the key is the candidate and the value is the frequency of the candidate candidates_freq = Counter(candidates) filtered_candidates = [] # Uses the dictionary to filter the candidates for candidate in candidates: freq = candidates_freq[candidate] if freq >= min_freq: filtered_candidates.append(candidate) return filtered_candidates def generate_candidate_keywords(sentence_list, stopword_pattern, stop_word_list, min_char_length=1, max_words_length=5, min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2): phrase_list = [] for s in sentence_list: tmp = re.sub(stopword_pattern, '|', s.strip()) phrases = tmp.split("|") for phrase in phrases: phrase = phrase.strip().lower() if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length): phrase_list.append(phrase) phrase_list += extract_adjoined_candidates(sentence_list, stop_word_list, min_words_length_adj, max_words_length_adj, min_phrase_freq_adj) return phrase_list def is_acceptable(phrase, min_char_length, max_words_length): # a phrase must have a min length in characters if len(phrase) < min_char_length: return 0 # a phrase must have a max number of words words = phrase.split() if len(words) > max_words_length: return 0 digits = 0 alpha = 0 for i in range(0, len(phrase)): if phrase[i].isdigit(): digits += 1 elif phrase[i].isalpha(): alpha += 1 # a phrase must have at least one alpha character if alpha == 0: return 0 # a phrase must have more alpha than digits characters if digits > alpha: return 0 return 1 def calculate_word_scores(phraseList): word_frequency = {} word_degree = {} for phrase in phraseList: word_list = separate_words(phrase, 0) word_list_length = len(word_list) word_list_degree = word_list_length - 1 # if word_list_degree > 3: word_list_degree = 3 #exp. for word in word_list: word_frequency.setdefault(word, 0) word_frequency[word] += 1 word_degree.setdefault(word, 0) word_degree[word] += word_list_degree # orig. # word_degree[word] += 1/(word_list_length*1.0) #exp. for item in word_frequency: word_degree[item] = word_degree[item] + word_frequency[item] # Calculate Word scores = deg(w)/frew(w) word_score = {} for item in word_frequency: word_score.setdefault(item, 0) word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) # orig. # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp. return word_score def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1): keyword_candidates = {} for phrase in phrase_list: if min_keyword_frequency > 1: if phrase_list.count(phrase) < min_keyword_frequency: continue keyword_candidates.setdefault(phrase, 0) word_list = separate_words(phrase, 0) candidate_score = 0 for word in word_list: candidate_score += word_score[word] keyword_candidates[phrase] = candidate_score return keyword_candidates class Rake(object): def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1, min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2): self.__stop_words_path = stop_words_path self.__stop_words_list = load_stop_words(stop_words_path) self.__min_char_length = min_char_length self.__max_words_length = max_words_length self.__min_keyword_frequency = min_keyword_frequency self.__min_words_length_adj = min_words_length_adj self.__max_words_length_adj = max_words_length_adj self.__min_phrase_freq_adj = min_phrase_freq_adj def run(self, text): sentence_list = split_sentences(text) stop_words_pattern = build_stop_word_regex(self.__stop_words_list) phrase_list = generate_candidate_keywords(sentence_list, stop_words_pattern, self.__stop_words_list, self.__min_char_length, self.__max_words_length, self.__min_words_length_adj, self.__max_words_length_adj, self.__min_phrase_freq_adj) word_scores = calculate_word_scores(phrase_list) keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency) sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True) return sorted_keywords # if test and __name__ == '__main__': # text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types." # # Split text into sentences # sentenceList = split_sentences(text) # # stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1 # stoppath = "data/stoplists/SmartStoplist.txt" # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1 # stopwordpattern = build_stop_word_regex(stoppath) # # generate candidate keywords # phraseList = generate_candidate_keywords(sentenceList, stopwordpattern, load_stop_words(stoppath)) # # calculate individual word scores # wordscores = calculate_word_scores(phraseList) # # generate candidate keyword scores # keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores) # if debug: print(keywordcandidates) # sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True) # if debug: print(sortedKeywords) # totalKeywords = len(sortedKeywords) # if debug: print(totalKeywords) # print(sortedKeywords[0:(totalKeywords // 3)]) # rake = Rake("data/stoplists/SmartStoplist.txt") # keywords = rake.run(text) # print(keywords)