系列文章目录
Python深度学习-NLP实战:深度学习系列培训教程
- Linux服务器软件的简单使用
- Linux命令的简单使用
- 训练集、测试集、验证集的介绍及制作
- 字向量的训练与生成
- 文本分类之SVM模型的简单使用
- 文本分类之FASTTEXT实现中文文本分类
- 命名实体识别(NER)之分词与序列标注、实体识别任务简介
前言
最近在做关于食品安全事件新闻的分类工作,我主要使用FastText实现中文文本分类,下表是模型测试的效果,感觉不错!
一、下载FastText
1.下载FastText第三方库
(如果安装第三方库时遇到问题,就改为下载源码)
2.下载FastText源码
FastText源码提取地址
其实只需要安装包里的FastText.py文件就可以了,里面封装了_Meter、_FastText等类。
大家也可以不用下载,直接复制下面代码在工程目录下保存为FastText.py即可。
(篇幅较长,下载安装包的朋友直接略过…)
# Copyright (c) 2017-present, Facebook, Inc.# All rights reserved.## This source code is licensed under the MIT license found in the# LICENSE file in the root directory of this source tree.from __future__ import absolute_importfrom __future__ import divisionfrom __future__ import print_functionfrom __future__ import unicode_literalsimport fasttext_pybind as fasttextimport numpy as npimport multiprocessingimport sysfrom itertools import chainloss_name = fasttext.loss_namemodel_name = fasttext.model_nameEOS = ""BOW = "EOW = ">"displayed_errors = {}def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs)class _Meter(object): def __init__(self, fasttext_model, meter):self.f = fasttext_modelself.m = meter def score_vs_true(self, label):"""Return scores and the gold of each sample for a specific label"""label_id = self.f.get_label_id(label)pair_list = self.m.scoreVsTrue(label_id)if pair_list: y_scores, y_true = zip(*pair_list)else: y_scores, y_true = ([], ())return np.array(y_scores, copy=False), np.array(y_true, copy=False) def precision_recall_curve(self, label=None):"""Return precision/recall curve"""if label: label_id = self.f.get_label_id(label) pair_list = self.m.precisionRecallCurveLabel(label_id)else: pair_list = self.m.precisionRecallCurve()if pair_list: precision, recall = zip(*pair_list)else: precision, recall = ([], ())return np.array(precision, copy=False), np.array(recall, copy=False) def precision_at_recall(self, recall, label=None):"""Return precision for a given recall"""if label: label_id = self.f.get_label_id(label) precision = self.m.precisionAtRecallLabel(label_id, recall)else: precision = self.m.precisionAtRecall(recall)return precision def recall_at_precision(self, precision, label=None):"""Return recall for a given precision"""if label: label_id = self.f.get_label_id(label) recall = self.m.recallAtPrecisionLabel(label_id, precision)else: recall = self.m.recallAtPrecision(precision)return recallclass _FastText(object): """ This class 
defines the API to inspect models and should not be used to create objects. It will be returned by functions such as load_model or train. In general this API assumes to be given only unicode for Python2 and the Python3 equvalent called str for any string-like arguments. All unicode strings are then encoded as UTF-8 and fed to the fastText C++ API. """ def __init__(self, model_path=None, args=None):self.f = fasttext.fasttext()if model_path is not None: self.f.loadModel(model_path)self._words = Noneself._labels = Noneself.set_args(args) def set_args(self, args=None):if args: arg_names = ['lr', 'dim', 'ws', 'epoch', 'minCount', 'minCountLabel', 'minn', 'maxn', 'neg', 'wordNgrams', 'loss', 'bucket', 'thread', 'lrUpdateRate', 't', 'label', 'verbose', 'pretrainedVectors'] for arg_name in arg_names: setattr(self, arg_name, getattr(args, arg_name)) def is_quantized(self):return self.f.isQuant() def get_dimension(self):"""Get the dimension (size) of a lookup vector (hidden layer)."""a = self.f.getArgs()return a.dim def get_word_vector(self, word):"""Get the vector representation of word."""dim = self.get_dimension()b = fasttext.Vector(dim)self.f.getWordVector(b, word)return np.array(b) def get_sentence_vector(self, text):"""Given a string, get a single vector represenation. This functionassumes to be given a single line of text. 
We split words onwhitespace (space, newline, tab, vertical tab) and the controlcharacters carriage return, formfeed and the null character."""if text.find('n') != -1: raise ValueError( "predict processes one line at a time (remove '\n')" )text += "n"dim = self.get_dimension()b = fasttext.Vector(dim)self.f.getSentenceVector(b, text)return np.array(b) def get_nearest_neighbors(self, word, k=10, on_unicode_error='strict'):return self.f.getNN(word, k, on_unicode_error) def get_analogies(self, wordA, wordB, wordC, k=10, on_unicode_error='strict'):return self.f.getAnalogies(wordA, wordB, wordC, k, on_unicode_error) def get_word_id(self, word):"""Given a word, get the word id within the dictionary.Returns -1 if word is not in the dictionary."""return self.f.getWordId(word) def get_label_id(self, label来源:南浔Pyer
声明:本站部分文章及图片转载于互联网,内容版权归原作者所有,如本站任何资料有侵权,请您尽早联系jinwei@zod.com.cn进行处理,非常感谢!