【Mahout二】基于Mahout CBayes算法的20newsgroup的脚本分析

编程技术  /  houtizong 发布于 3年前   109
#!/bin/bash## Licensed to the Apache Software Foundation (ASF) under one or more# contributor license agreements.  See the NOTICE file distributed with# this work for additional information regarding copyright ownership.# The ASF licenses this file to You under the Apache License, Version 2.0# (the "License"); you may not use this file except in compliance with# the License.  You may obtain a copy of the License at##     http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.### Downloads the 20newsgroups dataset, trains and tests a classifier.## To run:  change into the mahout directory and type:# examples/bin/classify-20newsgroups.sh////支持--help以查看该脚本的功能,本脚本是对经典的20 News Groups运行CBayes分类算法if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."  exitfiSCRIPT_PATH=${0%/*}if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then  cd $SCRIPT_PATHfiSTART_PATH=`pwd`# Set commands for dfssource ${START_PATH}/set-dfs-commands.sh////Mahout MapReduce 作业的本地工作目录,WORK_DIR=/tmp/mahout-work-${USER}////本脚本支持的分类算法,这里选择cnaivebayes-MapReduce算法algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)/////选择分类算法if [ -n "$1" ]; then  choice=$1else  echo "Please select a number to choose the corresponding task to run"  echo "1. ${algorithm[0]}"  echo "2. ${algorithm[1]}"  echo "3. ${algorithm[2]}"  echo "4. ${algorithm[3]}"  echo "5. ${algorithm[4]}"  echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"  read -p "Enter your choice : " choicefiecho "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"///alg中保存选择的算法alg=${algorithm[$choice-1]}# Spark specific check and work if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then  if [ "$MASTER" == "" ] ; then    echo "Plese set your MASTER env variable to point to your Spark Master URL. exiting..."    exit 1  fi  if [ "$MAHOUT_LOCAL" != "" ] ; then    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."    exit 1  fifi////如果不是clean操作if [ "x$alg" != "xclean" ]; then  echo "creating work directory at ${WORK_DIR}"  ////创建工作目录  mkdir -p ${WORK_DIR}  ////下载样本文件  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then    if [ ! -e ${WORK_DIR}/20news-bydate ]; then      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then        echo "Downloading 20news-bydate"        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz      fi      mkdir -p ${WORK_DIR}/20news-bydate      echo "Extracting..."      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..    fi  fifi#echo $START_PATHcd $START_PATHcd ../..set -e////如果CBayes分类算法if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark"  ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then  c=""  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then    c=" -c"  fi  set -x  echo "Preparing 20newsgroups data"  rm -rf ${WORK_DIR}/20news-all  mkdir ${WORK_DIR}/20news-all  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then    echo "Copying 20newsgroups data to HDFS"    set +e    $DFSRM ${WORK_DIR}/20news-all    $DFS -mkdir ${WORK_DIR}    $DFS -mkdir ${WORK_DIR}/20news-all    set -e    if [ $HVERSION -eq "1" ] ; then      echo "Copying 20newsgroups data to Hadoop 1 HDFS"      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all    elif [ $HVERSION -eq "2" ] ; then      echo "Copying 20newsgroups data to Hadoop 2 HDFS"      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/    fi  fi////1. 对样本文件进行创建Sequence File  echo "Creating sequence files from 20newsgroups data"  ./bin/mahout seqdirectory \    -i ${WORK_DIR}/20news-all \    -o ${WORK_DIR}/20news-seq -ow////2. 将Sequence File转换为tfidf向量文件,存放于20news-vectors目录下  echo "Converting sequence files to vectors"  ./bin/mahout seq2sparse \    -i ${WORK_DIR}/20news-seq \    -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf////3. 对生成的向量数据集(20news-vectors/tfidf-vectors)创建training向量文件和测试向量文件  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"  ./bin/mahout split \    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \    --trainingOutput ${WORK_DIR}/20news-train-vectors \    --testOutput ${WORK_DIR}/20news-test-vectors  \    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential    if [ "x$alg" == "xnaivebayes-MapReduce"  -o  "x$alg" == "xcnaivebayes-MapReduce" ]; then      ////4. 训练Naive Bayes模型,输入为训练向量,输出为训练模型      ////labelindex指的是分类名称,共20个,以Sequence File格式保存      echo "Training Naive Bayes model"      ./bin/mahout trainnb \        -i ${WORK_DIR}/20news-train-vectors \        -o ${WORK_DIR}/model \        -li ${WORK_DIR}/labelindex \        -ow $c            ////5.1 基于训练向量文件进行测试      echo "Self testing on training set"      ./bin/mahout testnb \        -i ${WORK_DIR}/20news-train-vectors\        -m ${WORK_DIR}/model \        -l ${WORK_DIR}/labelindex \        -ow -o ${WORK_DIR}/20news-testing $c      echo "Testing on holdout set"             ////5.2 基于测试向量文件进行测试      ./bin/mahout testnb \        -i ${WORK_DIR}/20news-test-vectors\        -m ${WORK_DIR}/model \        -l ${WORK_DIR}/labelindex \        -ow -o ${WORK_DIR}/20news-testing $c    elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then      echo "Training Naive Bayes model"      ./bin/mahout spark-trainnb \        -i ${WORK_DIR}/20news-train-vectors \        -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER      echo "Self testing on training set"      ./bin/mahout spark-testnb \        -i ${WORK_DIR}/20news-train-vectors\        -m ${WORK_DIR}/spark-model $c -ma $MASTER      echo "Testing on holdout set"      ./bin/mahout spark-testnb \        -i ${WORK_DIR}/20news-test-vectors\        -m ${WORK_DIR}/spark-model $c -ma $MASTER            fi////如果是线性回归算法elif [ "x$alg" == "xsgd" ]; then  if [ ! -e "/tmp/news-group.model" ]; then    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/  fi  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model////如果是clean操作,则删除工作目录elif [ "x$alg" == "xclean" ]; then  rm -rf $WORK_DIR  rm -rf /tmp/news-group.model  $DFSRM $WORK_DIRfi# Remove the work directory# 
 

请勿发布不友善或者负能量的内容。与人为善,比聪明更重要!

留言需要登陆哦

技术博客集 - 网站简介:
前后端技术:
后端基于Hyperf2.1框架开发,前端使用Bootstrap可视化布局系统生成

网站主要作用:
1.编程技术分享及讨论交流,内置聊天系统;
2.测试交流框架问题,比如:Hyperf、Laravel、TP、beego;
3.本站数据是基于大数据采集等爬虫技术为基础助力分享知识,如有侵权请发邮件到站长邮箱,站长会尽快处理;
4.站长邮箱:[email protected];

      订阅博客周刊 去订阅

文章归档

文章标签

友情链接

Auther ·HouTiZong
侯体宗的博客
© 2020 zongscan.com
版权所有ICP证 : 粤ICP备20027696号
PHP交流群 也可以扫右边的二维码
侯体宗的博客