【Mahout二】基于Mahout CBayes算法的20newsgroup的脚本分析-技术博客集

【Mahout二】基于Mahout CBayes算法的20newsgroup的脚本分析
编程技术 / houtizong 发布于 3年前 151

#!/bin/bash## Licensed to the Apache Software Foundation (ASF) under one or more# contributor license agreements.  See the NOTICE file distributed with# this work for additional information regarding copyright ownership.# The ASF licenses this file to You under the Apache License, Version 2.0# (the "License"); you may not use this file except in compliance with# the License.  You may obtain a copy of the License at##     http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.### Downloads the 20newsgroups dataset, trains and tests a classifier.## To run:  change into the mahout directory and type:# examples/bin/classify-20newsgroups.sh////支持--help以查看该脚本的功能，本脚本是对经典的20 News Groups运行CBayes分类算法if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."  exitfiSCRIPT_PATH=${0%/*}if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then  cd $SCRIPT_PATHfiSTART_PATH=`pwd`# Set commands for dfssource ${START_PATH}/set-dfs-commands.sh////Mahout MapReduce 作业的本地工作目录，WORK_DIR=/tmp/mahout-work-${USER}////本脚本支持的分类算法，这里选择cnaivebayes-MapReduce算法algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)/////选择分类算法if [ -n "$1" ]; then  choice=$1else  echo "Please select a number to choose the corresponding task to run"  echo "1. ${algorithm[0]}"  echo "2. ${algorithm[1]}"  echo "3. ${algorithm[2]}"  echo "4. ${algorithm[3]}"  echo "5. ${algorithm[4]}"  echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"  read -p "Enter your choice : " choicefiecho "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"///alg中保存选择的算法alg=${algorithm[$choice-1]}# Spark specific check and work if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then  if [ "$MASTER" == "" ] ; then    echo "Plese set your MASTER env variable to point to your Spark Master URL. exiting..."    exit 1  fi  if [ "$MAHOUT_LOCAL" != "" ] ; then    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."    exit 1  fifi////如果不是clean操作if [ "x$alg" != "xclean" ]; then  echo "creating work directory at ${WORK_DIR}"  ////创建工作目录  mkdir -p ${WORK_DIR}  ////下载样本文件  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then    if [ ! -e ${WORK_DIR}/20news-bydate ]; then      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then        echo "Downloading 20news-bydate"        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz      fi      mkdir -p ${WORK_DIR}/20news-bydate      echo "Extracting..."      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..    fi  fifi#echo $START_PATHcd $START_PATHcd ../..set -e////如果CBayes分类算法if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark"  ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then  c=""  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then    c=" -c"  fi  set -x  echo "Preparing 20newsgroups data"  rm -rf ${WORK_DIR}/20news-all  mkdir ${WORK_DIR}/20news-all  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then    echo "Copying 20newsgroups data to HDFS"    set +e    $DFSRM ${WORK_DIR}/20news-all    $DFS -mkdir ${WORK_DIR}    $DFS -mkdir ${WORK_DIR}/20news-all    set -e    if [ $HVERSION -eq "1" ] ; then      echo "Copying 20newsgroups data to Hadoop 1 HDFS"      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all    elif [ $HVERSION -eq "2" ] ; then      echo "Copying 20newsgroups data to Hadoop 2 HDFS"      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/    fi  fi////1. 对样本文件进行创建Sequence File  echo "Creating sequence files from 20newsgroups data"  ./bin/mahout seqdirectory \    -i ${WORK_DIR}/20news-all \    -o ${WORK_DIR}/20news-seq -ow////2. 将Sequence File转换为tfidf向量文件，存放于20news-vectors目录下  echo "Converting sequence files to vectors"  ./bin/mahout seq2sparse \    -i ${WORK_DIR}/20news-seq \    -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf////3. 对生成的向量数据集（20news-vectors/tfidf-vectors）创建training向量文件和测试向量文件  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"  ./bin/mahout split \    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \    --trainingOutput ${WORK_DIR}/20news-train-vectors \    --testOutput ${WORK_DIR}/20news-test-vectors  \    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential    if [ "x$alg" == "xnaivebayes-MapReduce"  -o  "x$alg" == "xcnaivebayes-MapReduce" ]; then      ////4. 训练Naive Bayes模型，输入为训练向量，输出为训练模型      ////labelindex指的是分类名称，共20个，以Sequence File格式保存      echo "Training Naive Bayes model"      ./bin/mahout trainnb \        -i ${WORK_DIR}/20news-train-vectors \        -o ${WORK_DIR}/model \        -li ${WORK_DIR}/labelindex \        -ow $c            ////5.1 基于训练向量文件进行测试      echo "Self testing on training set"      ./bin/mahout testnb \        -i ${WORK_DIR}/20news-train-vectors\        -m ${WORK_DIR}/model \        -l ${WORK_DIR}/labelindex \        -ow -o ${WORK_DIR}/20news-testing $c      echo "Testing on holdout set"             ////5.2 基于测试向量文件进行测试      ./bin/mahout testnb \        -i ${WORK_DIR}/20news-test-vectors\        -m ${WORK_DIR}/model \        -l ${WORK_DIR}/labelindex \        -ow -o ${WORK_DIR}/20news-testing $c    elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then      echo "Training Naive Bayes model"      ./bin/mahout spark-trainnb \        -i ${WORK_DIR}/20news-train-vectors \        -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER      echo "Self testing on training set"      ./bin/mahout spark-testnb \        -i ${WORK_DIR}/20news-train-vectors\        -m ${WORK_DIR}/spark-model $c -ma $MASTER      echo "Testing on holdout set"      ./bin/mahout spark-testnb \        -i ${WORK_DIR}/20news-test-vectors\        -m ${WORK_DIR}/spark-model $c -ma $MASTER            fi////如果是线性回归算法elif [ "x$alg" == "xsgd" ]; then  if [ ! -e "/tmp/news-group.model" ]; then    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/  fi  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model////如果是clean操作，则删除工作目录elif [ "x$alg" == "xclean" ]; then  rm -rf $WORK_DIR  rm -rf /tmp/news-group.model  $DFSRM $WORK_DIRfi# Remove the work directory#

上一篇：【Mahout一】基于Mahout 命令参数含义

下一篇：【Hadoop十五】Hadoop Counter

请勿发布不友善或者负能量的内容。与人为善，比聪明更重要！

<div  >  <pre class="Bash" name="code">#!/bin/bash## Licensed to the Apache Software Foundation (ASF) under one or more# contributor license agreements.  See the NOTICE file distributed with# this work for additional information regarding copyright ownership.# The ASF licenses this file to You under the Apache License, Version 2.0# (the &quot;License&quot;); you may not use this file except in compliance with# the License.  You may obtain a copy of the License at##     http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an &quot;AS IS&quot; BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.### Downloads the 20newsgroups dataset, trains and tests a classifier.## To run:  change into the mahout directory and type:# examples/bin/classify-20newsgroups.sh////支持--help以查看该脚本的功能，本脚本是对经典的20 News Groups运行CBayes分类算法if [ &quot;$1&quot; = &quot;--help&quot; ] || [ &quot;$1&quot; = &quot;--?&quot; ]; then  echo &quot;This script runs SGD and Bayes classifiers over the classic 20 News Groups.&quot;  exitfiSCRIPT_PATH=${0%/*}if [ &quot;$0&quot; != &quot;$SCRIPT_PATH&quot; ] &amp;&amp; [ &quot;$SCRIPT_PATH&quot; != &quot;&quot; ]; then  cd $SCRIPT_PATHfiSTART_PATH=`pwd`# Set commands for dfssource ${START_PATH}/set-dfs-commands.sh////Mahout MapReduce 作业的本地工作目录，WORK_DIR=/tmp/mahout-work-${USER}////本脚本支持的分类算法，这里选择cnaivebayes-MapReduce算法algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)/////选择分类算法if [ -n &quot;$1&quot; ]; then  choice=$1else  echo &quot;Please select a number to choose the corresponding task to run&quot;  echo &quot;1. ${algorithm[0]}&quot;  echo &quot;2. ${algorithm[1]}&quot;  echo &quot;3. ${algorithm[2]}&quot;  echo &quot;4. ${algorithm[3]}&quot;  echo &quot;5. ${algorithm[4]}&quot;  echo &quot;6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR&quot;  read -p &quot;Enter your choice : &quot; choicefiecho &quot;ok. You chose $choice and we'll use ${algorithm[$choice-1]}&quot;///alg中保存选择的算法alg=${algorithm[$choice-1]}# Spark specific check and work if [ &quot;x$alg&quot; == &quot;xnaivebayes-Spark&quot; -o &quot;x$alg&quot; == &quot;xcnaivebayes-Spark&quot; ]; then  if [ &quot;$MASTER&quot; == &quot;&quot; ] ; then    echo &quot;Plese set your MASTER env variable to point to your Spark Master URL. exiting...&quot;    exit 1  fi  if [ &quot;$MAHOUT_LOCAL&quot; != &quot;&quot; ] ; then    echo &quot;Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting...&quot;    exit 1  fifi////如果不是clean操作if [ &quot;x$alg&quot; != &quot;xclean&quot; ]; then  echo &quot;creating work directory at ${WORK_DIR}&quot;  ////创建工作目录  mkdir -p ${WORK_DIR}  ////下载样本文件  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then    if [ ! -e ${WORK_DIR}/20news-bydate ]; then      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then        echo &quot;Downloading 20news-bydate&quot;        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz      fi      mkdir -p ${WORK_DIR}/20news-bydate      echo &quot;Extracting...&quot;      cd ${WORK_DIR}/20news-bydate &amp;&amp; tar xzf ../20news-bydate.tar.gz &amp;&amp; cd .. &amp;&amp; cd ..    fi  fifi#echo $START_PATHcd $START_PATHcd ../..set -e////如果CBayes分类算法if  ( [ &quot;x$alg&quot; == &quot;xnaivebayes-MapReduce&quot; ] ||  [ &quot;x$alg&quot; == &quot;xcnaivebayes-MapReduce&quot; ] || [ &quot;x$alg&quot; == &quot;xnaivebayes-Spark&quot;  ] || [ &quot;x$alg&quot; == &quot;xcnaivebayes-Spark&quot; ] ); then  c=&quot;&quot;  if [ &quot;x$alg&quot; == &quot;xcnaivebayes-MapReduce&quot; -o &quot;x$alg&quot; == &quot;xnaivebayes-Spark&quot; ]; then    c=&quot; -c&quot;  fi  set -x  echo &quot;Preparing 20newsgroups data&quot;  rm -rf ${WORK_DIR}/20news-all  mkdir ${WORK_DIR}/20news-all  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all  if [ &quot;$HADOOP_HOME&quot; != &quot;&quot; ] &amp;&amp; [ &quot;$MAHOUT_LOCAL&quot; == &quot;&quot; ] ; then    echo &quot;Copying 20newsgroups data to HDFS&quot;    set +e    $DFSRM ${WORK_DIR}/20news-all    $DFS -mkdir ${WORK_DIR}    $DFS -mkdir ${WORK_DIR}/20news-all    set -e    if [ $HVERSION -eq &quot;1&quot; ] ; then      echo &quot;Copying 20newsgroups data to Hadoop 1 HDFS&quot;      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all    elif [ $HVERSION -eq &quot;2&quot; ] ; then      echo &quot;Copying 20newsgroups data to Hadoop 2 HDFS&quot;      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/    fi  fi////1. 对样本文件进行创建Sequence File  echo &quot;Creating sequence files from 20newsgroups data&quot;  ./bin/mahout seqdirectory \    -i ${WORK_DIR}/20news-all \    -o ${WORK_DIR}/20news-seq -ow////2. 将Sequence File转换为tfidf向量文件，存放于20news-vectors目录下  echo &quot;Converting sequence files to vectors&quot;  ./bin/mahout seq2sparse \    -i ${WORK_DIR}/20news-seq \    -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf////3. 对生成的向量数据集（20news-vectors/tfidf-vectors）创建training向量文件和测试向量文件  echo &quot;Creating training and holdout set with a random 80-20 split of the generated vector dataset&quot;  ./bin/mahout split \    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \    --trainingOutput ${WORK_DIR}/20news-train-vectors \    --testOutput ${WORK_DIR}/20news-test-vectors  \    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential    if [ &quot;x$alg&quot; == &quot;xnaivebayes-MapReduce&quot;  -o  &quot;x$alg&quot; == &quot;xcnaivebayes-MapReduce&quot; ]; then      ////4. 训练Naive Bayes模型，输入为训练向量，输出为训练模型      ////labelindex指的是分类名称，共20个，以Sequence File格式保存      echo &quot;Training Naive Bayes model&quot;      ./bin/mahout trainnb \        -i ${WORK_DIR}/20news-train-vectors \        -o ${WORK_DIR}/model \        -li ${WORK_DIR}/labelindex \        -ow $c            ////5.1 基于训练向量文件进行测试      echo &quot;Self testing on training set&quot;      ./bin/mahout testnb \        -i ${WORK_DIR}/20news-train-vectors\        -m ${WORK_DIR}/model \        -l ${WORK_DIR}/labelindex \        -ow -o ${WORK_DIR}/20news-testing $c      echo &quot;Testing on holdout set&quot;             ////5.2 基于测试向量文件进行测试      ./bin/mahout testnb \        -i ${WORK_DIR}/20news-test-vectors\        -m ${WORK_DIR}/model \        -l ${WORK_DIR}/labelindex \        -ow -o ${WORK_DIR}/20news-testing $c    elif [ &quot;x$alg&quot; == &quot;xnaivebayes-Spark&quot; -o &quot;x$alg&quot; == &quot;xcnaivebayes-Spark&quot; ]; then      echo &quot;Training Naive Bayes model&quot;      ./bin/mahout spark-trainnb \        -i ${WORK_DIR}/20news-train-vectors \        -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER      echo &quot;Self testing on training set&quot;      ./bin/mahout spark-testnb \        -i ${WORK_DIR}/20news-train-vectors\        -m ${WORK_DIR}/spark-model $c -ma $MASTER      echo &quot;Testing on holdout set&quot;      ./bin/mahout spark-testnb \        -i ${WORK_DIR}/20news-test-vectors\        -m ${WORK_DIR}/spark-model $c -ma $MASTER            fi////如果是线性回归算法elif [ &quot;x$alg&quot; == &quot;xsgd&quot; ]; then  if [ ! -e &quot;/tmp/news-group.model&quot; ]; then    echo &quot;Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/&quot;    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/  fi  echo &quot;Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model&quot;  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model////如果是clean操作，则删除工作目录elif [ &quot;x$alg&quot; == &quot;xclean&quot; ]; then  rm -rf $WORK_DIR  rm -rf /tmp/news-group.model  $DFSRM $WORK_DIRfi# Remove the work directory# </pre> &nbsp;</div>

留言需要登陆哦

技术博客集 - 网站简介：
前后端技术：
后端基于Hyperf2.1框架开发,前端使用Bootstrap可视化布局系统生成
网站主要作用：
1.编程技术分享及讨论交流，内置聊天系统;
2.测试交流框架问题，比如：Hyperf、Laravel、TP、beego;
3.本站数据是基于大数据采集等爬虫技术为基础助力分享知识，如有侵权请发邮件到站长邮箱，站长会尽快处理;
4.站长邮箱：[email protected];

文章归档

文章标签

友情链接

首页
关于我们

Auther ·HouTiZong: 侯体宗的博客