from __future__ import absolute_import, unicode_literals
__version__ = '0.38'
__license__ = 'MIT'

import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
from ._compat import *
from . import finalseg

if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename

_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

DEFAULT_DICT = None
DEFAULT_DICT_NAME = "dict.txt"

log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}

pool = None

re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

re_eng = re.compile('[a-zA-Z0-9]', re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U)
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
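
# Editor's note (not in the original source): re_han_default keeps runs of Han
# characters, ASCII letters/digits and a few joining symbols together as blocks,
# while re_skip_default matches the whitespace between them. For instance,
# re_han_default.split("jieba, 分词!") yields ['', 'jieba', ', ', '分词', '!'];
# cut() skips the empty block and only feeds the han/alnum blocks to the DAG cutter.
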
def setLogLevel(log_level):
    global default_logger
    default_logger.setLevel(log_level)

class Tokenizer(object):

    def __init__(self, dictionary=DEFAULT_DICT):
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        self.FREQ = {}
        self.total = 0
        self.user_word_tag_tab = {}
        self.initialized = False
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        return '<Tokenizer dictionary=%r>' % self.dictionary
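
    # Editor's note (not in the original source): gen_pfdict builds the prefix table
    # used by get_DAG. Every dictionary word keeps its frequency, and every proper
    # prefix that is not itself listed gets frequency 0. A hypothetical entry
    # "清华大学 2053" would contribute roughly
    # {'清': 0, '清华': 0, '清华大': 0, '清华大学': 2053}; actual values depend on the
    # rest of the dictionary, since a prefix may also be a word in its own right.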
    def gen_pfdict(self, f):
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        for lineno, line in enumerate(f, 1):
            try:
                line = line.strip().decode('utf-8')
                word, freq = line.split(' ')[:2]
                freq = int(freq)
                lfreq[word] = freq
                ltotal += freq
                for ch in xrange(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        lfreq[wfrag] = 0
            except ValueError:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        f.close()
        return lfreq, ltotal
    def initialize(self, dictionary=None):
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    load_from_cache_fail = True

            if load_from_cache_fail:
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
| default_logger.debug("Prefix dict has been built succesfully.") | |
    def check_initialized(self):
        if not self.initialized:
            self.initialize()
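
    # Editor's note (not in the original source): calc fills `route` from right to
    # left with dynamic programming. route[idx] is a pair
    # (best log-probability of segmenting sentence[idx:], end index of the word
    # chosen at idx), so the best path can be read off by repeatedly jumping to
    # route[x][1] + 1, as the __cut_DAG* methods below do. Unseen fragments fall
    # back to frequency 1 via `or 1`.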
    def calc(self, sentence, DAG, route):
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        for idx in xrange(N - 1, -1, -1):
            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                              logtotal + route[x + 1][0], x) for x in DAG[idx])
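
    # Editor's note (not in the original source): get_DAG maps each character index k
    # to the list of end indices i for which sentence[k:i + 1] is a dictionary word.
    # If, hypothetically, both "北" and "北京" were in FREQ, the position of "北" would
    # get DAG[k] = [k, k + 1]; a position starting no multi-character word just gets [k].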
    def get_DAG(self, sentence):
        self.check_initialized()
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in self.FREQ:
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG
    def __cut_all(self, sentence):
        dag = self.get_DAG(sentence)
        old_j = -1
        for k, L in iteritems(dag):
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j

    def __cut_DAG_NO_HMM(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
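
    # Editor's note (not in the original source): unlike __cut_DAG_NO_HMM above,
    # __cut_DAG buffers runs of single characters and, when the buffer is not itself
    # a dictionary word, passes it to finalseg.cut (the HMM recognizer) so that
    # unseen words such as names can still be emitted as one token.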
    def __cut_DAG(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    if len(buf) == 1:
                        yield buf
                        buf = ''
                    else:
                        if not self.FREQ.get(buf):
                            recognized = finalseg.cut(buf)
                            for t in recognized:
                                yield t
                        else:
                            for elem in buf:
                                yield elem
                        buf = ''
                yield l_word
            x = y

        if buf:
            if len(buf) == 1:
                yield buf
            elif not self.FREQ.get(buf):
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem
    def cut(self, sentence, cut_all=False, HMM=True):
        '''
        The main function that segments an entire sentence that contains
        Chinese characters into separate words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
        '''
        sentence = strdecode(sentence)

        if cut_all:
            re_han = re_han_cut_all
            re_skip = re_skip_cut_all
        else:
            re_han = re_han_default
            re_skip = re_skip_default
        if cut_all:
            cut_block = self.__cut_all
        elif HMM:
            cut_block = self.__cut_DAG
        else:
            cut_block = self.__cut_DAG_NO_HMM
        blocks = re_han.split(sentence)
        for blk in blocks:
            if not blk:
                continue
            if re_han.match(blk):
                for word in cut_block(blk):
                    yield word
            else:
                tmp = re_skip.split(blk)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x
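
    # Illustrative usage of cut() (editor's sketch, not in the original source):
    #
    #   import jieba
    #   print("/".join(jieba.cut("我来到北京清华大学")))
    #   # expected output resembles: 我/来到/北京/清华大学
    #
    # With cut_all=True the full pattern also yields overlapping dictionary words.
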
    def cut_for_search(self, sentence, HMM=True):
        """
        Finer segmentation for search engines.
        """
        words = self.cut(sentence, HMM=HMM)
        for w in words:
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if self.FREQ.get(gram2):
                        yield gram2
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if self.FREQ.get(gram3):
                        yield gram3
            yield w
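
    # Illustrative usage of cut_for_search() (editor's sketch, not in the original
    # source):
    #
    #   print(", ".join(jieba.cut_for_search("小明硕士毕业于中国科学院计算所")))
    #
    # Besides each word, the in-dictionary 2-grams and 3-grams it contains are also
    # yielded (e.g. "科学" and "科学院" alongside "中国科学院"), which suits search
    # indexing.
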
    def lcut(self, *args, **kwargs):
        return list(self.cut(*args, **kwargs))

    def lcut_for_search(self, *args, **kwargs):
        return list(self.cut_for_search(*args, **kwargs))

    _lcut = lcut
    _lcut_for_search = lcut_for_search

    def _lcut_no_hmm(self, sentence):
        return self.lcut(sentence, False, False)

    def _lcut_all(self, sentence):
        return self.lcut(sentence, True)

    def _lcut_for_search_no_hmm(self, sentence):
        return self.lcut_for_search(sentence, False)

    def get_dict_file(self):
        if self.dictionary == DEFAULT_DICT:
            return get_module_res(DEFAULT_DICT_NAME)
        else:
            return open(self.dictionary, 'rb')
    def load_userdict(self, f):
        '''
        Load a personalized dictionary to improve the detection rate.

        Parameter:
            - f : A plain text file that contains words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be ignored
        '''
        self.check_initialized()
        if isinstance(f, string_types):
            f_name = f
            f = open(f, 'rb')
        else:
            f_name = resolve_filename(f)
        for lineno, ln in enumerate(f, 1):
            line = ln.strip()
            if not isinstance(line, text_type):
                try:
                    line = line.decode('utf-8').lstrip('\ufeff')
                except UnicodeDecodeError:
                    raise ValueError('dictionary file %s must be utf-8' % f_name)
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            self.add_word(word, freq, tag)
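
    # Illustrative usage of load_userdict() (editor's sketch, not in the original
    # source): a user dictionary file such as "userdict.txt" (hypothetical name) with
    #
    #   云计算 5
    #   创新办 3 i
    #   台中
    #
    # can be loaded via jieba.load_userdict("userdict.txt") before segmenting;
    # frequency and word type are both optional.
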
    def add_word(self, word, freq=None, tag=None):
        """
        Add a word to the dictionary.

        freq and tag can be omitted; freq defaults to a calculated value
        that ensures the word can be cut out.
        """
        self.check_initialized()
        word = strdecode(word)
        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
        self.FREQ[word] = freq
        self.total += freq
        if tag:
            self.user_word_tag_tab[word] = tag
        for ch in xrange(len(word)):
            wfrag = word[:ch + 1]
            if wfrag not in self.FREQ:
                self.FREQ[wfrag] = 0

    def del_word(self, word):
        """
        Convenient function for deleting a word.
        """
        self.add_word(word, 0)
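
    # Illustrative usage of add_word()/del_word() (editor's sketch, not in the
    # original source):
    #
    #   jieba.add_word("石墨烯")                    # keep "石墨烯" as a single token
    #   jieba.add_word("创新办", freq=5, tag="i")   # frequency/tag values are hypothetical
    #   jieba.del_word("自定义词")                  # del_word simply sets the frequency to 0
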
    def suggest_freq(self, segment, tune=False):
        """
        Suggest word frequency to force the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into.
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
        """
        self.check_initialized()
        ftotal = float(self.total)
        freq = 1
        if isinstance(segment, string_types):
            word = segment
            for seg in self.cut(word, HMM=False):
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
        else:
            segment = tuple(map(strdecode, segment))
            word = ''.join(segment)
            for seg in segment:
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
        if tune:
            self.add_word(word, freq)
        return freq
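
    # Illustrative usage of suggest_freq() (editor's sketch, not in the original
    # source):
    #
    #   jieba.suggest_freq(('中', '将'), tune=True)   # encourage splitting into 中 / 将
    #   jieba.suggest_freq('台中', tune=True)         # encourage keeping 台中 whole
    #
    # Since the HMM can still merge or split characters, check the effect with
    # jieba.cut(..., HMM=False).
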
    def tokenize(self, unicode_sentence, mode="default", HMM=True):
        """
        Tokenize a sentence and yield tuples of (word, start, end).

        Parameter:
            - sentence: the str(unicode) to be segmented.
            - mode: "default" or "search", "search" is for finer segmentation.
            - HMM: whether to use the Hidden Markov Model.
        """
        if not isinstance(unicode_sentence, text_type):
            raise ValueError("jieba: the input parameter should be unicode.")
        start = 0
        if mode == 'default':
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                yield (w, start, start + width)
                start += width
        else:
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                if len(w) > 2:
                    for i in xrange(len(w) - 1):
                        gram2 = w[i:i + 2]
                        if self.FREQ.get(gram2):
                            yield (gram2, start + i, start + i + 2)
                if len(w) > 3:
                    for i in xrange(len(w) - 2):
                        gram3 = w[i:i + 3]
                        if self.FREQ.get(gram3):
                            yield (gram3, start + i, start + i + 3)
                yield (w, start, start + width)
                start += width
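
    # Illustrative usage of tokenize() (editor's sketch, not in the original source):
    #
    #   for word, start, end in jieba.tokenize(u"永和服装饰品有限公司"):
    #       print("%s start: %d end: %d" % (word, start, end))
    #
    # In mode="search" the in-dictionary 2-grams and 3-grams are also yielded with
    # their own offsets.
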
    def set_dictionary(self, dictionary_path):
        with self.lock:
            abs_path = _get_abs_path(dictionary_path)
            if not os.path.isfile(abs_path):
                raise Exception("jieba: file does not exist: " + abs_path)
            self.dictionary = abs_path
            self.initialized = False


# default Tokenizer instance

dt = Tokenizer()

# global functions

get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
add_word = dt.add_word
calc = dt.calc
cut = dt.cut
lcut = dt.lcut
cut_for_search = dt.cut_for_search
lcut_for_search = dt.lcut_for_search
del_word = dt.del_word
get_DAG = dt.get_DAG
get_dict_file = dt.get_dict_file
initialize = dt.initialize
load_userdict = dt.load_userdict
set_dictionary = dt.set_dictionary
suggest_freq = dt.suggest_freq
tokenize = dt.tokenize
user_word_tag_tab = dt.user_word_tag_tab

def _lcut_all(s):
    return dt._lcut_all(s)


def _lcut(s):
    return dt._lcut(s)


# _lcut_no_hmm is required by _pcut below; the duplicate _lcut_all wrapper that stood
# in its place has been removed.
def _lcut_no_hmm(s):
    return dt._lcut_no_hmm(s)


def _lcut_for_search(s):
    return dt._lcut_for_search(s)


def _lcut_for_search_no_hmm(s):
    return dt._lcut_for_search_no_hmm(s)


def _pcut(sentence, cut_all=False, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if cut_all:
        result = pool.map(_lcut_all, parts)
    elif HMM:
        result = pool.map(_lcut, parts)
    else:
        result = pool.map(_lcut_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def _pcut_for_search(sentence, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if HMM:
        result = pool.map(_lcut_for_search, parts)
    else:
        result = pool.map(_lcut_for_search_no_hmm, parts)
    for r in result:
        for w in r:
            yield w

def enable_parallel(processnum=None):
    """
    Change the module's `cut` and `cut_for_search` functions to the
    parallel version.

    Note that this only works with the default Tokenizer `dt`; custom Tokenizer
    instances are not supported.
    """
    global pool, dt, cut, cut_for_search
    from multiprocessing import cpu_count
    if os.name == 'nt':
        raise NotImplementedError(
            "jieba: parallel mode only supports posix system")
    else:
        from multiprocessing import Pool
    dt.check_initialized()
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)
    cut = _pcut
    cut_for_search = _pcut_for_search


def disable_parallel():
    global pool, dt, cut, cut_for_search
    if pool:
        pool.close()
        pool = None
    cut = dt.cut
    cut_for_search = dt.cut_for_search
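

# Illustrative usage of parallel mode (editor's sketch, not in the original source):
#
#   import jieba
#   jieba.enable_parallel(4)        # POSIX only; defaults to cpu_count() processes
#   content = open("big_file.txt", "rb").read()   # file name is hypothetical
#   words = "/".join(jieba.cut(content))
#   jieba.disable_parallel()
#
# Only the module-level cut/cut_for_search are swapped for their parallel versions;
# lcut and methods on custom Tokenizer instances keep running in a single process.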