python中scikit-learn机器代码实例


Posted in Python onAugust 05, 2018

我们给大家带来了关于学习python中scikit-learn机器代码的相关具体实例,以下就是全部代码内容:

# -*- coding: utf-8 -*-
 
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import cross_validation
from sklearn import preprocessing
#import iris_data
 
def load_data():
  iris = load_iris()
  x, y = iris.data, iris.target
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
  return x_train,y_train,x_test,y_test
 
def train_clf3(train_data, train_tags):
  clf = LinearSVC(C=1100.0)#default with 'rbf' 
  clf.fit(train_data,train_tags)
  return clf
 
def train_clf(train_data, train_tags):
  clf = MultinomialNB(alpha=0.01)
  print numpy.asarray(train_tags)
  clf.fit(train_data, numpy.asarray(train_tags))
  return clf
 
def evaluate(actual, pred):
  m_precision = metrics.precision_score(actual, pred)
  m_recall = metrics.recall_score(actual, pred)
  print 'precision:{0:.3f}'.format(m_precision)
  print 'recall:{0:0.3f}'.format(m_recall)
  print 'f1-score:{0:.8f}'.format(metrics.f1_score(actual,pred));
 
x_train,y_train,x_test,y_test = load_data()
 
clf = train_clf(x_train, y_train)
 
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print metrics.classification_report(y_test, pred)
 
 
使用自定义数据
# coding: utf-8
 
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import linear_model
 
train_corpus = [
   '我们 我们 好孩子 认证 。 就是',
   '我们 好孩子 认证 。 中国',
   '我们 好孩子 认证 。 孤独',
   '我们 好孩子 认证 。',
 ]
 
test_corpus = [
   '我 菲律宾 韩国',
   '我们 好孩子 认证 。 中国',
 ]
 
def input_data(train_file, test_file):
  train_words = []
  train_tags = []
  test_words = []
  test_tags = []
  f1 = codecs.open(train_file,'r','utf-8','ignore')
  for line in f1:
    tks = line.split(':', 1)
    word_list = tks[1]
    word_array = word_list[1:(len(word_list)-3)].split(", ")
    train_words.append(" ".join(word_array))
    train_tags.append(tks[0])
  f2 = codecs.open(test_file,'r','utf-8','ignore')
  for line in f2:
    tks = line.split(':', 1)
    word_list = tks[1]
    word_array = word_list[1:(len(word_list)-3)].split(", ")
    test_words.append(" ".join(word_array))
    test_tags.append(tks[0])
  return train_words, train_tags, test_words, test_tags
 
 
def vectorize(train_words, test_words):
  #v = HashingVectorizer(n_features=25000, non_negative=True)
  v = HashingVectorizer(non_negative=True)
  #v = CountVectorizer(min_df=1)
  train_data = v.fit_transform(train_words)
  test_data = v.fit_transform(test_words)
  return train_data, test_data
 
def vectorize1(train_words, test_words):
  tv = TfidfVectorizer(sublinear_tf = False,use_idf=True);
  train_data = tv.fit_transform(train_words);
  tv2 = TfidfVectorizer(vocabulary = tv.vocabulary_);
  test_data = tv2.fit_transform(test_words);
  return train_data, test_data
  
def vectorize2(train_words, test_words):
  count_v1= CountVectorizer(stop_words = 'english', max_df = 0.5); 
  counts_train = count_v1.fit_transform(train_words); 
   
  count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_);
  counts_test = count_v2.fit_transform(test_words);
   
  tfidftransformer = TfidfTransformer();
   
  train_data = tfidftransformer.fit(counts_train).transform(counts_train); 
  test_data = tfidftransformer.fit(counts_test).transform(counts_test);
  return train_data, test_data
 
def evaluate(actual, pred):
  m_precision = metrics.precision_score(actual, pred)
  m_recall = metrics.recall_score(actual, pred)
  print 'precision:{0:.3f}'.format(m_precision)
  print 'recall:{0:0.3f}'.format(m_recall)
  print 'f1-score:{0:.8f}'.format(metrics.f1_score(actual,pred));
 
 
def train_clf(train_data, train_tags):
  clf = MultinomialNB(alpha=0.01)
  clf.fit(train_data, numpy.asarray(train_tags))
  return clf
 
 
def train_clf1(train_data, train_tags):
  #KNN Classifier
  clf = KNeighborsClassifier()#default with k=5 
  clf.fit(train_data, numpy.asarray(train_tags)) 
  return clf
 
def train_clf2(train_data, train_tags):
  clf = linear_model.LogisticRegression(C=1e5) 
  clf.fit(train_data,train_tags)
  return clf
 
def train_clf3(train_data, train_tags):
  clf = LinearSVC(C=1100.0)#default with 'rbf' 
  clf.fit(train_data,train_tags)
  return clf
 
def train_clf4(train_data, train_tags):
  """
  随机森林,不可使用稀疏矩阵
  """
  clf = RandomForestClassifier(n_estimators=10)
  clf.fit(train_data.todense(),train_tags)
  return clf
 
#使用codecs逐行读取
def codecs_read_label_line(filename):
  label_list=[]
  f = codecs.open(filename,'r','utf-8','ignore')
  line = f.readline()
  while line:
    #label_list.append(line[0:len(line)-2])
    label_list.append(line[0:len(line)-1])
    line = f.readline()
  f.close()
  return label_list
 
def save_test_features(test_url, test_label):
  test_feature_list = codecs_read_label_line('test.dat')
  fw = open('test_labeded.dat',"w+")
  
  for (url,label) in zip(test_feature_list,test_label):
    fw.write(url+'\t'+label)
    fw.write('\n')
  fw.close()
 
def main():
  train_file = u'..\\file\\py_train.txt'
  test_file = u'..\\file\\py_test.txt'
  train_words, train_tags, test_words, test_tags = input_data(train_file, test_file)
  #print len(train_words), len(train_tags), len(test_words), len(test_words), 
  
  train_data, test_data = vectorize1(train_words, test_words)
  print type(train_data)
  print train_data.shape
  print test_data.shape
  print test_data[0].shape
  print numpy.asarray(test_data[0])
  
  clf = train_clf3(train_data, train_tags)
  
  scores = cross_validation.cross_val_score(
  clf, train_data, train_tags, cv=5, scoring="f1_weighted")
  print scores
 
  #predicted = cross_validation.cross_val_predict(clf, train_data,train_tags, cv=5)  
  '''
  
  '''
  pred = clf.predict(test_data)
  error_list=[]
  for (true_tag,predict_tag) in zip(test_tags,pred):
    if true_tag != predict_tag:
      print true_tag,predict_tag
      error_list.append(true_tag+' '+predict_tag)
  print len(error_list)
  evaluate(numpy.asarray(test_tags), pred)
  '''
  #输出打标签结果
  test_feature_list = codecs_read_label_line('test.dat')
  save_test_features(test_feature_list, pred)
  '''
  
 
if __name__ == '__main__':
  main()
Python 相关文章推荐
python将MongoDB里的ObjectId转换为时间戳的方法
Mar 13 Python
Pyhton中单行和多行注释的使用方法及规范
Oct 11 Python
python实现k-means聚类算法
Feb 23 Python
Python将list中的string批量转化成int/float的方法
Jun 26 Python
详解用python生成随机数的几种方法
Aug 04 Python
Python3.7 pyodbc完美配置访问access数据库
Oct 03 Python
Python多重继承之菱形继承的实例详解
Feb 12 Python
Python 实现网课实时监控自动签到、打卡功能
Mar 12 Python
Python使用jpype模块调用jar包过程解析
Jul 29 Python
Python实现随机爬山算法
Jan 29 Python
django注册用邮箱发送验证码的实现
Apr 18 Python
Python基础之元组与文件知识总结
May 19 Python
解决使用pycharm提交代码时冲突之后文件丢失找回的方法
Aug 05 #Python
Python字符串、整数、和浮点型数相互转换实例
Aug 04 #Python
python与caffe改变通道顺序的方法
Aug 04 #Python
Python爬虫PyQuery库基本用法入门教程
Aug 04 #Python
python list转矩阵的实例讲解
Aug 04 #Python
Python 生成 -1~1 之间的随机数矩阵方法
Aug 04 #Python
Python爬虫框架scrapy实现downloader_middleware设置proxy代理功能示例
Aug 04 #Python
You might like
PHP远程连接MYSQL数据库非常慢的解决方法
2008/07/05 PHP
分享一个超好用的php header下载函数
2014/01/31 PHP
thinkphp模板的包含与渲染实例分析
2014/11/26 PHP
PHP接收App端发送文件流的方法
2016/09/23 PHP
phpMyAdmin无法登陆的解决方法
2017/04/27 PHP
PHP创建XML的方法示例【基于DOMDocument类及SimpleXMLElement类】
2019/09/10 PHP
使用jQuery和PHP实现类似360功能开关效果
2014/02/12 Javascript
jquery等待效果示例
2014/05/01 Javascript
nodejs实现黑名单中间件设计
2014/06/17 NodeJs
JQuery的ON()方法支持的所有事件罗列
2015/02/28 Javascript
jQuery实现放大镜效果实例代码
2016/03/17 Javascript
清除js缓存的多种方法总结
2016/12/09 Javascript
JS正则匹配中文的方法示例
2017/01/06 Javascript
vue2.0 axios前后端数据处理实例代码
2017/06/30 Javascript
JavaScript原型对象、构造函数和实例对象功能与用法详解
2018/08/04 Javascript
Angularjs 根据一个select的值去设置另一个select的值方法
2018/08/13 Javascript
element-ui中select组件绑定值改变,触发change事件方法
2018/08/24 Javascript
Vue中的Props(不可变状态)
2018/09/29 Javascript
微信小程序新手教程之页面打开数量限制
2019/03/03 Javascript
原生js canvas实现鼠标跟随效果
2020/08/02 Javascript
实例讲解python函数式编程
2014/06/09 Python
Python中SOAP项目的介绍及其在web开发中的应用
2015/04/14 Python
Python中IPYTHON入门实例
2015/05/11 Python
Django查找网站项目根目录和对正则表达式的支持
2015/07/15 Python
Python 正则表达式入门(初级篇)
2016/12/07 Python
Python机器学习k-近邻算法(K Nearest Neighbor)实例详解
2018/06/25 Python
python登录WeChat 实现自动回复实例详解
2019/05/28 Python
python爬虫增加访问量的方法
2019/08/22 Python
基于python判断目录或者文件代码实例
2019/11/29 Python
奥地利顶级内衣丝袜品牌英国站:Wolford英国
2016/08/29 全球购物
德国高性价比网上药店:medpex
2017/07/09 全球购物
MAC彩妆英国官网:M·A·C UK
2018/05/30 全球购物
英国在线汽车和面包车零件商店:Car Parts 4 Less
2018/08/15 全球购物
Belvilla法国:休闲度假房屋出租
2020/10/03 全球购物
企业党建工作汇报材料
2014/08/19 职场文书
活动主持人开场白
2015/05/28 职场文书