index(es)

parent cabfe50777
commit afc71795d1

75  index.py  Normal file
@ -0,0 +1,75 @@
import argparse, os, sys, json, logging

from keywords import rake
from lxml import etree as et

import yake

logging.basicConfig(level=logging.DEBUG)


def index(f):

    if not os.path.isfile(f):
        logging.error(f + " is not a valid file.")
        return None

    # filename should be of the type: N.xxxx.xml
    # ex: 3.Network.xml
    ch = os.path.basename(f).split('.')[0]

    indx = {}

    r = rake.Rake('keywords/FoxStoplist.txt', max_words_length=3, min_keyword_frequency=1)

    y = yake.KeywordExtractor(lan="en", top=40)

    root = et.parse(f).getroot()

    for m in root.findall('mails/mail'):

        nbr_str = m.find('nbr').text
        content = m.find('content').text

        # format nbr
        nbr_str = ch + '.' + nbr_str

        # yake
        try:
            kwy = y.extract_keywords(content)
            for k in kwy:
                kw = k[0]
                if kw not in indx:
                    indx[kw] = []
                indx[kw].append(nbr_str)
        except Exception as e:
            print(e)

        # rake
        try:
            kwr = r.run(content)
            kwr = [x for x in kwr if x[1] > 4.0]
            for k in kwr:
                kw = k[0]
                if kw not in indx:
                    indx[kw] = []
                indx[kw].append(nbr_str)
        except Exception as e:
            print(e)

    return indx


if __name__ == '__main__':

    p = argparse.ArgumentParser(description='Builds an index of emails')
    p.add_argument('file', metavar="f", help="xml file to index")

    args = p.parse_args()

    ind = index(args.file)
    ind_out = {'selected': {}, 'orphan': ind}

    print(json.dumps(ind_out, indent=4, sort_keys=True, ensure_ascii=False))
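For orientation (an illustrative sketch, not part of the commit): index() returns a flat {keyword: [mail numbers]} mapping, and the __main__ block wraps it so that every keyword starts out "orphan"; keywords are promoted to "selected" later through the web UI added in www/routes.py below. The keyword strings and mail numbers here are invented:

    # hypothetical output shape of `python index.py xml/3.Network.xml`
    example_out = {
        "selected": {},
        "orphan": {
            "tactical media": ["3.12", "3.47"],  # keyword -> "<chapter>.<mail nbr>" refs
            "mailing list": ["3.2"],
        },
    }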
34  index.sh  Executable file
@ -0,0 +1,34 @@
#!/bin/sh

echo "1.Welcome.xml -> 1.Welcome-index.js"
python index.py xml/1.Welcome.xml > index/1.Welcome-index.js
echo "2.DeepEurope.xml -> 2.DeepEurope-index.js"
python index.py xml/2.DeepEurope.xml > index/2.DeepEurope-index.js
echo "3.Network.xml -> 3.Network-index.js"
python index.py xml/3.Network.xml > index/3.Network-index.js
echo "4.net.art.xml -> 4.net.art-index.js"
python index.py xml/4.net.art.xml > index/4.net.art-index.js
echo "5.Netzkritik.xml -> 5.Netzkritik-index.js"
python index.py xml/5.Netzkritik.xml > index/5.Netzkritik-index.js
echo "6.FLOSS.xml -> 6.FLOSS-index.js"
python index.py xml/6.FLOSS.xml > index/6.FLOSS-index.js
echo "7.Critique_Art_Politics.xml -> 7.Critique_Art_Politics-index.js"
python index.py xml/7.Critique_Art_Politics.xml > index/7.Critique_Art_Politics-index.js
echo "8.TacticalMedia.xml -> 8.TacticalMedia-index.js"
python index.py xml/8.TacticalMedia.xml > index/8.TacticalMedia-index.js
echo "9.List_talking_to_List.xml -> 9.List_talking_to_List-index.js"
python index.py xml/9.List_talking_to_List.xml > index/9.List_talking_to_List-index.js
echo "10.Cyberfeminism.xml -> 10.Cyberfeminism-index.js"
python index.py xml/10.Cyberfeminism.xml > index/10.Cyberfeminism-index.js
echo "11.CODE.xml -> 11.CODE-index.js"
python index.py xml/11.CODE.xml > index/11.CODE-index.js
echo "13.Post-digital.xml -> 13.Post-digital-index.js"
python index.py xml/13.Post-digital.xml > index/13.Post-digital-index.js
echo "14.MANIFESTO.xml -> 14.MANIFESTO-index.js"
python index.py xml/14.MANIFESTO.xml > index/14.MANIFESTO-index.js
echo "15.LutherBlissett.xml -> 15.LutherBlissett-index.js"
python index.py xml/15.LutherBlissett.xml > index/15.LutherBlissett-index.js
echo "16.NN.xml -> 16.NN-index.js"
python index.py xml/16.NN.xml > index/16.NN-index.js
echo "17.Interviews.xml -> 17.Interviews-index.js"
python index.py xml/17.Interviews.xml > index/17.Interviews-index.js
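The sixteen near-identical command pairs above could also be generated. A minimal Python sketch of the same batch run (not part of the commit; it assumes the xml/ and index/ layout used by the script, and unlike the explicit list it would pick up any extra xml/*.xml files present):

    import glob, os, subprocess

    for src in sorted(glob.glob('xml/*.xml')):
        name = os.path.basename(src)[:-len('.xml')]      # e.g. "3.Network"
        dst = os.path.join('index', name + '-index.js')
        print(name + '.xml -> ' + os.path.basename(dst))
        with open(dst, 'w') as out:
            # same invocation as index.sh: one index file per XML chapter
            subprocess.run(['python', 'index.py', src], stdout=out, check=True)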
1039  index/1.Welcome-index.js  Normal file (file diff suppressed because it is too large)
5072  index/10.Cyberfeminism-index.js  Normal file (file diff suppressed because it is too large)
5902  index/11.CODE-index.js  Normal file (file diff suppressed because it is too large)
8037  index/13.Post-digital-index.js  Normal file (file diff suppressed because it is too large)
7353  index/14.MANIFESTO-index.js  Normal file (file diff suppressed because it is too large)
6879  index/15.LutherBlissett-index.js  Normal file (file diff suppressed because it is too large)
21363  index/16.NN-index.js  Normal file (file diff suppressed because it is too large)
23676  index/17.Interviews-index.js  Normal file (file diff suppressed because it is too large)
2894  index/2.DeepEurope-index.js  Normal file (file diff suppressed because it is too large)
16602  index/3.Network-index.js  Normal file (file diff suppressed because it is too large)
9845  index/4.net.art-index.js  Normal file (file diff suppressed because it is too large)
4579  index/5.Netzkritik-index.js  Normal file (file diff suppressed because it is too large)
5835  index/6.FLOSS-index.js  Normal file (file diff suppressed because it is too large)
4849  index/7.Critique_Art_Politics-index.js  Normal file (file diff suppressed because it is too large)
11893  index/8.TacticalMedia-index.js  Normal file (file diff suppressed because it is too large)
14346  index/9.List_talking_to_List-index.js  Normal file (file diff suppressed because it is too large)
426  keywords/FoxStoplist.txt  Normal file
@ -0,0 +1,426 @@
#From "A stop list for general text" Fox 1989
a
about
above
across
after
again
against
all
almost
alone
along
already
also
although
always
among
an
and
another
any
anybody
anyone
anything
anywhere
are
area
areas
around
as
ask
asked
asking
asks
at
away
b
back
backed
backing
backs
be
because
became
become
becomes
been
before
began
behind
being
beings
best
better
between
big
both
but
by
c
came
can
cannot
case
cases
certain
certainly
clear
clearly
come
could
d
did
differ
different
differently
do
does
done
down
downed
downing
downs
during
e
each
early
either
end
ended
ending
ends
enough
even
evenly
ever
every
everybody
everyone
everything
everywhere
f
face
faces
fact
facts
far
felt
few
find
finds
first
for
four
from
full
fully
further
furthered
furthering
furthers
g
gave
general
generally
get
gets
give
given
gives
go
going
good
goods
got
great
greater
greatest
group
grouped
grouping
groups
h
had
has
have
having
he
her
herself
here
high
higher
highest
him
himself
his
how
however
i
if
important
in
interest
interested
interesting
interests
into
is
it
its
itself
j
just
k
keep
keeps
kind
knew
know
known
knows
l
large
largely
last
later
latest
least
less
let
lets
like
likely
long
longer
longest
m
made
make
making
man
many
may
me
member
members
men
might
more
most
mostly
mr
mrs
much
must
my
myself
n
necessary
need
needed
needing
needs
never
new
newer
newest
next
no
non
not
nobody
noone
nothing
now
nowhere
number
numbered
numbering
numbers
o
of
off
often
old
older
oldest
on
once
one
only
open
opened
opening
opens
or
order
ordered
ordering
orders
other
others
our
out
over
p
part
parted
parting
parts
per
perhaps
place
places
point
pointed
pointing
points
possible
present
presented
presenting
presents
problem
problems
put
puts
q
quite
r
rather
really
right
room
rooms
s
said
same
saw
say
says
second
seconds
see
seem
seemed
seeming
seems
sees
several
shall
she
should
show
showed
showing
shows
side
sides
since
small
smaller
smallest
so
some
somebody
someone
something
somewhere
state
states
still
such
sure
t
take
taken
than
that
the
their
them
then
there
therefore
these
they
thing
things
think
thinks
this
those
though
thought
thoughts
three
through
thus
to
today
together
too
took
toward
turn
turned
turning
turns
two
u
under
until
up
upon
us
use
uses
used
v
very
w
want
wanted
wanting
wants
was
way
ways
we
well
wells
went
were
what
when
where
whether
which
while
who
whole
whose
why
will
with
within
without
work
worked
working
works
would
x
y
year
years
yet
you
young
younger
youngest
your
yours
z
168  keywords/oldrake.py  Normal file
@ -0,0 +1,168 @@
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.

import re
import operator

debug = False
test = True


def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = r'\b' + word + r'(?![\w-])'  # added look ahead for hyphen
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


def generate_candidate_keywords(sentence_list, stopword_pattern):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "":
                phrase_list.append(phrase)
    return phrase_list


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3  #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0)  #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  # orig.
        # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0)  #exp.
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score):
    keyword_candidates = {}
    for phrase in phrase_list:
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path):
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        sentence_list = split_sentences(text)

        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)

        sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)
        return sorted_keywords


# if test:
#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

#     # Split text into sentences
#     sentenceList = split_sentences(text)
#     # stoppath = "FoxStoplist.txt"  # Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
#     stoppath = "SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
#     stopwordpattern = build_stop_word_regex(stoppath)

#     # generate candidate keywords
#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

#     # calculate individual word scores
#     wordscores = calculate_word_scores(phraseList)

#     # generate candidate keyword scores
#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
#     if debug: print keywordcandidates

#     sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
#     if debug: print sortedKeywords

#     totalKeywords = len(sortedKeywords)
#     if debug: print totalKeywords
#     print sortedKeywords[0:(totalKeywords / 3)]

#     rake = Rake("SmartStoplist.txt")
#     keywords = rake.run(text)
#     print keywords
301  keywords/rake.py  Normal file
@ -0,0 +1,301 @@
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
#
# NOTE: The original code (from https://github.com/aneesha/RAKE)
# has been extended by a_medelyan (zelandiya)
# with a set of heuristics to decide whether a phrase is an acceptable candidate
# as well as the ability to set frequency and phrase length parameters
# important when dealing with longer documents
#
# NOTE 2: The code published by a_medelyan (https://github.com/zelandiya/RAKE-tutorial)
# has been additionally extended by Marco Pegoraro to implement the adjoined candidate
# feature described in section 1.2.3 of the original paper. Note that this creates the
# need to modify the metric for the candidate score, because the adjoined candidates
# have a very high score (because of the nature of the original score metric)

from __future__ import absolute_import
from __future__ import print_function
import re
import operator
# import six
# from six.moves import range
from collections import Counter

debug = False
test = False


def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_list):
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = '\\b' + word + '\\b'
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


#
# Function that extracts the adjoined candidates from a list of sentences and filters them by frequency
#
def extract_adjoined_candidates(sentence_list, stoplist, min_keywords, max_keywords, min_freq):
    adjoined_candidates = []
    for s in sentence_list:
        # Extracts the candidates from each single sentence and adds them to the list
        adjoined_candidates += adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords)
    # Filters the candidates and returns them
    return filter_adjoined_candidates(adjoined_candidates, min_freq)


# return adjoined_candidates

#
# Function that extracts the adjoined candidates from a single sentence
#
def adjoined_candidates_from_sentence(s, stoplist, min_keywords, max_keywords):
    # Initializes the candidate list to empty
    candidates = []
    # Splits the sentence to get a list of lowercase words
    sl = s.lower().split()
    # For each possible length of the adjoined candidate
    for num_keywords in range(min_keywords, max_keywords + 1):
        # Until the third-last word
        for i in range(0, len(sl) - num_keywords):
            # Position i marks the first word of the candidate. Proceeds only if it's not a stopword
            if sl[i] not in stoplist:
                candidate = sl[i]
                # Initializes j (the pointer to the next word) to 1
                j = 1
                # Initializes the word counter. This counts the non-stopwords words in the candidate
                keyword_counter = 1
                contains_stopword = False
                # Until the word count reaches the maximum number of keywords or the end is reached
                while keyword_counter < num_keywords and i + j < len(sl):
                    # Adds the next word to the candidate
                    candidate = candidate + ' ' + sl[i + j]
                    # If it's not a stopword, increase the word counter. If it is, turn on the flag
                    if sl[i + j] not in stoplist:
                        keyword_counter += 1
                    else:
                        contains_stopword = True
                    # Next position
                    j += 1
                # Adds the candidate to the list only if:
                # 1) it contains at least a stopword (if it doesn't it's already been considered)
                # AND
                # 2) the last word is not a stopword
                # AND
                # 3) the adjoined candidate keyphrase contains exactly the correct number of keywords (to avoid doubles)
                if contains_stopword and candidate.split()[-1] not in stoplist and keyword_counter == num_keywords:
                    candidates.append(candidate)
    return candidates


#
# Function that filters the adjoined candidates to keep only those that appear with a certain frequency
#
def filter_adjoined_candidates(candidates, min_freq):
    # Creates a dictionary where the key is the candidate and the value is the frequency of the candidate
    candidates_freq = Counter(candidates)
    filtered_candidates = []
    # Uses the dictionary to filter the candidates
    for candidate in candidates:
        freq = candidates_freq[candidate]
        if freq >= min_freq:
            filtered_candidates.append(candidate)
    return filtered_candidates


def generate_candidate_keywords(sentence_list, stopword_pattern, stop_word_list, min_char_length=1, max_words_length=5,
                                min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
                phrase_list.append(phrase)
    phrase_list += extract_adjoined_candidates(sentence_list, stop_word_list, min_words_length_adj,
                                               max_words_length_adj, min_phrase_freq_adj)
    return phrase_list


def is_acceptable(phrase, min_char_length, max_words_length):
    # a phrase must have a min length in characters
    if len(phrase) < min_char_length:
        return 0

    # a phrase must have a max number of words
    words = phrase.split()
    if len(words) > max_words_length:
        return 0

    digits = 0
    alpha = 0
    for i in range(0, len(phrase)):
        if phrase[i].isdigit():
            digits += 1
        elif phrase[i].isalpha():
            alpha += 1

    # a phrase must have at least one alpha character
    if alpha == 0:
        return 0

    # a phrase must have more alpha than digits characters
    if digits > alpha:
        return 0
    return 1


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3  #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0)  #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  # orig.
        # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0)  #exp.
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
    keyword_candidates = {}
    for phrase in phrase_list:
        if min_keyword_frequency > 1:
            if phrase_list.count(phrase) < min_keyword_frequency:
                continue
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1,
                 min_words_length_adj=1, max_words_length_adj=1, min_phrase_freq_adj=2):
        self.__stop_words_path = stop_words_path
        self.__stop_words_list = load_stop_words(stop_words_path)
        self.__min_char_length = min_char_length
        self.__max_words_length = max_words_length
        self.__min_keyword_frequency = min_keyword_frequency
        self.__min_words_length_adj = min_words_length_adj
        self.__max_words_length_adj = max_words_length_adj
        self.__min_phrase_freq_adj = min_phrase_freq_adj

    def run(self, text):
        sentence_list = split_sentences(text)

        stop_words_pattern = build_stop_word_regex(self.__stop_words_list)

        phrase_list = generate_candidate_keywords(sentence_list, stop_words_pattern, self.__stop_words_list,
                                                  self.__min_char_length, self.__max_words_length,
                                                  self.__min_words_length_adj, self.__max_words_length_adj,
                                                  self.__min_phrase_freq_adj)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)

        sorted_keywords = sorted(keyword_candidates.items(), key=lambda x: x[1], reverse=True)
        return sorted_keywords


# if test and __name__ == '__main__':
#     text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

#     # Split text into sentences
#     sentenceList = split_sentences(text)
#     # stoppath = "FoxStoplist.txt"  # Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
#     stoppath = "data/stoplists/SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
#     stopwordpattern = build_stop_word_regex(stoppath)

#     # generate candidate keywords
#     phraseList = generate_candidate_keywords(sentenceList, stopwordpattern, load_stop_words(stoppath))

#     # calculate individual word scores
#     wordscores = calculate_word_scores(phraseList)

#     # generate candidate keyword scores
#     keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
#     if debug: print(keywordcandidates)

#     sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
#     if debug: print(sortedKeywords)

#     totalKeywords = len(sortedKeywords)
#     if debug: print(totalKeywords)
#     print(sortedKeywords[0:(totalKeywords // 3)])

#     rake = Rake("data/stoplists/SmartStoplist.txt")
#     keywords = rake.run(text)
#     print(keywords)
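As a usage note (a sketch, not part of the commit): index.py drives this module roughly as follows. run() returns (phrase, score) pairs sorted by descending score, and index.py keeps only phrases scoring above 4.0. The sample sentence is invented:

    from keywords import rake

    r = rake.Rake('keywords/FoxStoplist.txt', max_words_length=3, min_keyword_frequency=1)
    for phrase, score in r.run("Tactical media emerged out of the net critique debates."):
        if score > 4.0:
            print(phrase, score)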
119  www/routes.py
@ -3,6 +3,7 @@ from www import app
 import json, logging, os, glob
 from lxml import etree as et
 import config
+from collections import OrderedDict

 def list_all(d, ext):

@ -12,6 +13,112 @@ def list_all(d, ext):

     return [os.path.basename(f) for f in glob.glob(os.path.join(d, "*." + ext))]

+@app.route('/')
+def top():
+    return render_template("index.html")
+
+'''
+INDEX
+'''
+
+def read_index(d, fn):
+    fp = os.path.join(d, fn)
+    if not os.path.isfile(fp):
+        return None
+
+    with open(fp) as f:
+        index_data = json.load(f, object_pairs_hook=OrderedDict)
+
+    return index_data
+
+# def add_selected_kw_index(d, fn, kw):
+#     fp = os.path.join(d, fn)
+#     if not os.path.isfile(fp):
+#         return False
+
+#     with open(fp) as f:
+#         index_data = json.load(f)
+
+#     if kw not in index_data['orphan']:
+#         return False
+
+#     v = index_data['orphan'].pop(kw)
+#     if kw not in index_data['selected']:
+#         index_data['selected'][kw] = []
+
+#     index_data['selected'][kw] += v
+
+#     with open(fp, 'w') as fout:
+#         json.dump(index_data, fout, indent=4, sort_keys=True, ensure_ascii=False)
+
+#     return True
+
+
+def modify_selected_kw_index(d, fn, kw, action="add"):
+    fp = os.path.join(d, fn)
+    if not os.path.isfile(fp):
+        return False
+
+    with open(fp) as f:
+        index_data = json.load(f)
+
+    if action == 'add':
+        in_dic = index_data['selected']
+        out_dic = index_data['orphan']
+    elif action == 'delete':
+        out_dic = index_data['selected']
+        in_dic = index_data['orphan']
+    else:
+        return False
+
+    if kw not in out_dic:
+        return False
+
+    v = out_dic.pop(kw)
+    if kw not in in_dic:
+        in_dic[kw] = []
+
+    in_dic[kw] += v
+
+    with open(fp, 'w') as fout:
+        json.dump(index_data, fout, indent=4, sort_keys=True, ensure_ascii=False)
+
+    return True
+
+
+@app.route('/index', methods = ['GET'])
+def index():
+    if request.method == 'GET':
+        li = list_all(config.index['path'], 'js')
+        li = sorted(li, key=lambda x: int(x.split('.')[0]))
+        return render_template("list_files_all.html", title="INDEX [all]", prefix="/index/", files=li)
+
+@app.route('/index/<path:fn>', methods = ['GET', 'POST'])
+def indexfn(fn):
+    if request.method == 'GET':
+        data = read_index(config.index['path'], fn)
+        if data is not None:
+            return render_template("indx.html", fn=fn, selected=data['selected'], orphan=data['orphan'])
+        else:
+            return "File: " + fn + " does not exist."
+    elif request.method == 'POST':
+        data = request.form
+        a = data.get('action')
+        if a == "add":
+            logging.info("POST ADD " + fn + " -- " + data.get('kw') + " ++ " + data.get('list'))
+            if modify_selected_kw_index(config.index['path'], fn, data.get('kw')):
+                return "ok"
+        elif a == "delete":
+            logging.info("POST DELETE " + fn + " -- " + data.get('kw') + " ++ " + data.get('list'))
+            if modify_selected_kw_index(config.index['path'], fn, data.get('kw'), action="delete"):
+                return "ok"
+        return "-"
+
+
+'''
+XML
+'''
+
 def read_xml(d, fn):
     fp = os.path.join(d, fn)
     if not os.path.isfile(fp):

@ -97,17 +204,12 @@ def delete_nbr_xml(d, fn, nbr, date):
     tr.write(fp)
     return True

-
-@app.route('/')
-def index():
-    return render_template("index.html")
-
-@app.route('/xml', methods = ['GET', 'POST'])
+@app.route('/xml', methods = ['GET'])
 def xml():
     if request.method == 'GET':
         li = list_all(config.xml['path'], 'xml')
         li = sorted(li, key=lambda x: int(x.split('.')[0]))
-        return render_template("xml_all.html", files=li)
+        return render_template("list_files_all.html", title="XML [all]", prefix="/xml/", files=li)

 @app.route('/xml/<path:fn>', methods = ['GET', 'POST'])
 def xmlfn(fn):

@ -129,6 +231,3 @@ def xmlfn(fn):
         if delete_nbr_xml(config.xml['path'], fn, data.get('nbr'), data.get('date')):
             return "ok"
         return "-"
-
-
-
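To exercise the new POST endpoint by hand (a sketch, not part of the commit; it assumes the Flask app is serving on localhost:5000 and the third-party requests package is installed):

    import requests

    # move the (hypothetical) keyword "tactical media" from orphan to selected
    r = requests.post('http://localhost:5000/index/3.Network-index.js',
                      data={'action': 'add', 'kw': 'tactical media', 'list': 'orphan'})
    print(r.text)  # "ok" on success, "-" otherwise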
10  www/static/indx.js  Normal file
@ -0,0 +1,10 @@
$(document).ready(function(){
    $('.add, .delete').click(function(e) {
        var li = $(this).parent("li");
        $.post('/index/' + li.data("file"), {'action': $(this).attr('class'), 'kw': li.data("kw"), 'list': li.data("list")}, function(d) {
            if(d === 'ok') {
                location.reload();
            }
        });
    });
});
28  www/templates/indx.html  Normal file
@ -0,0 +1,28 @@
<html>
<head>
<meta charset="utf-8">
<title>{{fn}}</title>
<script type="text/javascript" src="{{ url_for('static',filename='jquery-3.2.1.min.js') }}" charset="utf-8"></script>
<script type="text/javascript" src="{{ url_for('static',filename='indx.js') }}"></script>
</head>
<body>
<h1>{{fn}}</h1>
<div id="all">
<h2>Selected</h2>
<ul>
{% for kw, s in selected.items() %}
<li data-file="{{fn}}" data-kw="{{kw}}" data-list="selected">{{kw}} {% for ss in s %} - {{ss}} {% endfor %}<button class="delete">-</button></li>
{% endfor %}
</ul>
<hr>
<hr>
<hr>
<h2>Orphans</h2>
<ul>
{% for kw, s in orphan.items() %}
<li data-file="{{fn}}" data-kw="{{kw}}" data-list="orphan">{{kw}} {% for ss in s %} - {{ss}} {% endfor %}<button class="add">+</button></li>
{% endfor %}
</ul>
</div>
</body>
</html>
@ -1,14 +1,14 @@
 <html>
 <head>
 <meta charset="utf-8">
-<title>XML [all]</title>
+<title>{{title}}</title>
 </head>
 <body>
-<h1>XML [all]</h1>
+<h1>{{title}}</h1>
 <div id="all">
 <ul>
 {% for f in files %}
-<li><a href="/xml/{{f}}">{{f}}</a></li>
+<li><a href="{{prefix}}{{f}}">{{f}}</a></li>
 {% endfor %}
 </ul>
 </div>