| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| from __future__ import absolute_import, unicode_literals | |
| import sys | |
| from operator import itemgetter | |
| from collections import defaultdict | |
| import jieba.posseg | |
| from .tfidf import KeywordExtractor | |
| from .._compat import * | |
class UndirectWeightedGraph:
    """Undirected weighted graph scored with a fixed-damping TextRank pass."""

    d = 0.85  # damping factor of the TextRank recurrence

    def __init__(self):
        # Adjacency list: node -> list of (node, neighbour, weight) tuples.
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        # use a tuple (start, end, weight) instead of a Edge object;
        # the graph is undirected, so store the edge once per endpoint.
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Run 10 TextRank iterations and return min-max-normalized scores."""
        ws = defaultdict(float)
        outSum = defaultdict(float)

        # Every node starts with an equal share of rank mass.
        initial = 1.0 / (len(self.graph) or 1.0)
        for node, edges in self.graph.items():
            ws[node] = initial
            # Start from 0.0 so the total stays a float (matters under py2 division).
            outSum[node] = sum((e[2] for e in edges), 0.0)

        # Visit nodes in sorted order so iteration results are reproducible.
        ordered_nodes = sorted(self.graph.keys())
        for _ in range(10):  # 10 iters
            for node in ordered_nodes:
                contribution = 0
                for _src, neighbour, weight in self.graph[node]:
                    contribution += weight / outSum[neighbour] * ws[neighbour]
                ws[node] = (1 - self.d) + self.d * contribution

        # Seed min with float max and max with float min, then scan.
        min_rank, max_rank = sys.float_info[0], sys.float_info[3]
        for w in ws.values():
            min_rank = min(min_rank, w)
            max_rank = max(max_rank, w)

        # to unify the weights, don't *100.
        for node, w in ws.items():
            ws[node] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws
class TextRank(KeywordExtractor):
    """Keyword extractor implementing the TextRank graph-ranking algorithm."""

    def __init__(self):
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
        self.span = 5  # co-occurrence window size (in tokens)

    def pairfilter(self, wp):
        # A (word, flag) pair qualifies only if its POS is allowed, the word
        # is at least two characters after stripping, and it is not a stop word.
        if wp.flag not in self.pos_filt:
            return False
        if len(wp.word.strip()) < 2:
            return False
        return wp.word.lower() not in self.stop_words

    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        graph = UndirectWeightedGraph()
        cooccurrence = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        total = len(words)

        # Count co-occurrences of qualifying pairs inside a sliding window.
        for i, wp in enumerate(words):
            if not self.pairfilter(wp):
                continue
            for j in range(i + 1, min(i + self.span, total)):
                other = words[j]
                if not self.pairfilter(other):
                    continue
                if allowPOS and withFlag:
                    # Key on the full pair objects so flags survive.
                    cooccurrence[(wp, other)] += 1
                else:
                    cooccurrence[(wp.word, other.word)] += 1

        for (left, right), count in cooccurrence.items():
            graph.addEdge(left, right, count)

        nodes_rank = graph.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=lambda node: nodes_rank[node], reverse=True)

        return tags[:topK] if topK else tags

    extract_tags = textrank