[Mahout Part 2] A script analysis of Mahout's CBayes algorithm on the 20newsgroups dataset
This post walks through examples/bin/classify-20newsgroups.sh, the Mahout example script that downloads the 20newsgroups dataset and then trains and tests a classifier. Analysis notes are inlined below as # comments.

#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Downloads the 20newsgroups dataset, trains and tests a classifier.
#
# To run:  change into the mahout directory and type:
# examples/bin/classify-20newsgroups.sh

# Note: the script supports --help to describe what it does; this post runs the
# CBayes classifier over the classic 20 News Groups dataset.
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
  exit
fi

SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  cd $SCRIPT_PATH
fi
START_PATH=`pwd`

# Set commands for dfs
source ${START_PATH}/set-dfs-commands.sh

# Local working directory for the Mahout MapReduce jobs
WORK_DIR=/tmp/mahout-work-${USER}

# Classification algorithms supported by this script; here we choose cnaivebayes-MapReduce
algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)

# Choose a classification algorithm
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding task to run"
  echo "1. ${algorithm[0]}"
  echo "2. ${algorithm[1]}"
  echo "3. ${algorithm[2]}"
  echo "4. ${algorithm[3]}"
  echo "5. ${algorithm[4]}"
  echo "6. ${algorithm[5]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
# alg holds the chosen algorithm
alg=${algorithm[$choice-1]}

# Spark specific check and work
if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
  if [ "$MASTER" == "" ] ; then
    echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
    exit 1
  fi
  if [ "$MAHOUT_LOCAL" != "" ] ; then
    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
    exit 1
  fi
fi

# Everything except the clean task needs the dataset
if [ "x$alg" != "xclean" ]; then
  echo "creating work directory at ${WORK_DIR}"
  # Create the work directory
  mkdir -p ${WORK_DIR}
  # Download the sample data if it is not already present
  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
    if [ ! -e ${WORK_DIR}/20news-bydate ]; then
      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
        echo "Downloading 20news-bydate"
        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
      fi
      mkdir -p ${WORK_DIR}/20news-bydate
      echo "Extracting..."
      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
    fi
  fi
fi
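# Note (added for illustration): the tarball unpacks into 20news-bydate-train/ and
# 20news-bydate-test/, each containing one subdirectory per newsgroup with one
# message per file; the SGD branch further below relies on this pre-made split.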
fi fifi#echo $START_PATHcd $START_PATHcd ../..set -e////如果CBayes分类算法if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then c="" if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then c=" -c" fi set -x echo "Preparing 20newsgroups data" rm -rf ${WORK_DIR}/20news-all mkdir ${WORK_DIR}/20news-all cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then echo "Copying 20newsgroups data to HDFS" set +e $DFSRM ${WORK_DIR}/20news-all $DFS -mkdir ${WORK_DIR} $DFS -mkdir ${WORK_DIR}/20news-all set -e if [ $HVERSION -eq "1" ] ; then echo "Copying 20newsgroups data to Hadoop 1 HDFS" $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all elif [ $HVERSION -eq "2" ] ; then echo "Copying 20newsgroups data to Hadoop 2 HDFS" $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/ fi fi////1. 对样本文件进行创建Sequence File echo "Creating sequence files from 20newsgroups data" ./bin/mahout seqdirectory \ -i ${WORK_DIR}/20news-all \ -o ${WORK_DIR}/20news-seq -ow////2. 将Sequence File转换为tfidf向量文件,存放于20news-vectors目录下 echo "Converting sequence files to vectors" ./bin/mahout seq2sparse \ -i ${WORK_DIR}/20news-seq \ -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf////3. 对生成的向量数据集(20news-vectors/tfidf-vectors)创建training向量文件和测试向量文件 echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" ./bin/mahout split \ -i ${WORK_DIR}/20news-vectors/tfidf-vectors \ --trainingOutput ${WORK_DIR}/20news-train-vectors \ --testOutput ${WORK_DIR}/20news-test-vectors \ --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then ////4. 训练Naive Bayes模型,输入为训练向量,输出为训练模型 ////labelindex指的是分类名称,共20个,以Sequence File格式保存 echo "Training Naive Bayes model" ./bin/mahout trainnb \ -i ${WORK_DIR}/20news-train-vectors \ -o ${WORK_DIR}/model \ -li ${WORK_DIR}/labelindex \ -ow $c ////5.1 基于训练向量文件进行测试 echo "Self testing on training set" ./bin/mahout testnb \ -i ${WORK_DIR}/20news-train-vectors\ -m ${WORK_DIR}/model \ -l ${WORK_DIR}/labelindex \ -ow -o ${WORK_DIR}/20news-testing $c echo "Testing on holdout set" ////5.2 基于测试向量文件进行测试 ./bin/mahout testnb \ -i ${WORK_DIR}/20news-test-vectors\ -m ${WORK_DIR}/model \ -l ${WORK_DIR}/labelindex \ -ow -o ${WORK_DIR}/20news-testing $c elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then echo "Training Naive Bayes model" ./bin/mahout spark-trainnb \ -i ${WORK_DIR}/20news-train-vectors \ -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER echo "Self testing on training set" ./bin/mahout spark-testnb \ -i ${WORK_DIR}/20news-train-vectors\ -m ${WORK_DIR}/spark-model $c -ma $MASTER echo "Testing on holdout set" ./bin/mahout spark-testnb \ -i ${WORK_DIR}/20news-test-vectors\ -m ${WORK_DIR}/spark-model $c -ma $MASTER fi////如果是线性回归算法elif [ "x$alg" == "xsgd" ]; then if [ ! 
# The SGD branch: trains a logistic regression classifier with stochastic gradient descent
elif [ "x$alg" == "xsgd" ]; then
  if [ ! -e "/tmp/news-group.model" ]; then
    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
  fi
  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model

# The clean task: remove the local and HDFS work directories and the SGD model
elif [ "x$alg" == "xclean" ]; then
  rm -rf $WORK_DIR
  rm -rf /tmp/news-group.model
  $DFSRM $WORK_DIR
fi
# Remove the work directory
#
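To reproduce the run in this post without the interactive menu, the script accepts the menu number as its first argument. A minimal sketch, assuming you start from the root of a Mahout checkout (MAHOUT_HOME below is an illustrative placeholder, per the script's own header comment):

cd $MAHOUT_HOME                              # assumed: root of the Mahout source tree
examples/bin/classify-20newsgroups.sh 1      # 1 = cnaivebayes-MapReduce
examples/bin/classify-20newsgroups.sh 6      # 6 = clean, removes /tmp/mahout-work-$USER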
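Because labelindex, the model, and the tf-idf vectors are all SequenceFiles, they can be inspected with Mahout's seqdumper utility. A sketch, assuming the default WORK_DIR used by the script:

./bin/mahout seqdumper -i /tmp/mahout-work-${USER}/labelindex                          # the 20 category names and their label ids
./bin/mahout seqdumper -i /tmp/mahout-work-${USER}/20news-vectors/tfidf-vectors | head # a sample of the tf-idf vectors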