# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm,
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.
# John Wiley and Sons, Ltd.

import re
import operator

debug = False
test = True


def is_number(s):
    """Return True if s can be parsed as an int or a float."""
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return them as a list of words.
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    with open(stop_word_file) as f:
        for line in f:
            if line.strip()[0:1] != "#":
                for word in line.split():  # in case there is more than one word per line
                    stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that are longer than a specified number of characters.
    @param text The text that must be split into words.
    @param min_word_return_size The minimum number of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        # leave numbers in the phrase, but don't count them as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


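if debug:
    # Illustrative sketch, not part of the original module: the splitter keeps
    # alphanumerics plus _, +, - and /, lowercases every token, and drops pure
    # numbers so they never dilute the scores of their phrases.
    print(separate_words("Criteria of 2 linear Diophantine equations", 0))
    # -> ['criteria', 'of', 'linear', 'diophantine', 'equations']

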
def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split into sentences.
    """
    sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
    sentences = sentence_delimiters.split(text)
    return sentences


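if debug:
    # Illustrative sketch, not part of the original module: commas, semicolons,
    # colons and similar delimiters all count as boundaries here, so the
    # returned "sentences" are really delimiter-bounded fragments.
    print(split_sentences("Upper bounds, and lower bounds: both are given."))
    # -> ['Upper bounds', ' and lower bounds', ' both are given', '']

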
def build_stop_word_regex(stop_word_file_path):
    """Compile a single regex that matches any stop word from the given file."""
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = r'\b' + word + r'(?![\w-])'  # added lookahead so a stop word does not match the start of a hyphenated token
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


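if debug:
    # Illustrative sketch with a made-up one-word stoplist: the (?![\w-])
    # lookahead stops a stop word from matching the prefix of a hyphenated
    # token, so the standalone "in" matches but "in-depth" survives intact.
    _p = re.compile(r'\bin(?![\w-])', re.IGNORECASE)
    print(_p.sub('|', "results in in-depth analysis"))
    # -> 'results | in-depth analysis'

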
def generate_candidate_keywords(sentence_list, stopword_pattern):
    """Split each sentence at stop words; the remaining runs of content words become candidate phrases."""
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "":
                phrase_list.append(phrase)
    return phrase_list


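if debug:
    # Illustrative sketch with a hand-built two-word stoplist ("of", "the");
    # real runs use the pattern from build_stop_word_regex. Stop words are
    # replaced by '|' and the remaining runs become candidate phrases.
    _pattern = re.compile(r'\bof(?![\w-])|\bthe(?![\w-])', re.IGNORECASE)
    print(generate_candidate_keywords(["Compatibility of the linear constraints"], _pattern))
    # -> ['compatibility', 'linear constraints']

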
def calculate_word_scores(phraseList):
    """Score each word as deg(w)/freq(w), where deg(w) counts co-occurrences within candidate phrases."""
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3  # exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0)  # exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  # orig.
        # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0)  # exp.
    return word_score


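if debug:
    # Illustrative worked example, not part of the original module: "linear"
    # appears in both phrases (freq 2) and co-occurs with three other words
    # (degree 3, plus freq 2 gives 5), so its score is 5/2 = 2.5; the words
    # appearing only in the three-word phrase score 3.0.
    print(calculate_word_scores(["linear constraints", "linear diophantine equations"]))
    # -> {'linear': 2.5, 'constraints': 2.0, 'diophantine': 3.0, 'equations': 3.0}

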
def generate_candidate_keyword_scores(phrase_list, word_score):
    """Score each candidate phrase as the sum of its member word scores."""
    keyword_candidates = {}
    for phrase in phrase_list:
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


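if debug:
    # Illustrative sketch reusing the word scores from the example above:
    # a phrase's score is simply the sum of its member word scores.
    _scores = {'linear': 2.5, 'constraints': 2.0}
    print(generate_candidate_keyword_scores(['linear constraints'], _scores))
    # -> {'linear constraints': 4.5}

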
class Rake(object):
    def __init__(self, stop_words_path):
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        sentence_list = split_sentences(text)

        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)

        sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)
        return sorted_keywords


if test:
    text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

    # Split text into sentences
    sentenceList = split_sentences(text)
    # stoppath = "FoxStoplist.txt"  # Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
    stoppath = "SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0-score words in Table 1.1
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug: print(keywordcandidates)

    sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
    if debug: print(sortedKeywords)

    totalKeywords = len(sortedKeywords)
    if debug: print(totalKeywords)
    print(sortedKeywords[0:(totalKeywords // 3)])

    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    print(keywords)