【Avro三】Hadoop MapReduce读写Avro文件-技术博客集

【Avro三】Hadoop MapReduce读写Avro文件
编程技术 / houtizong 发布于 3年前 69

Avro是Doug Cutting（此人绝对是神一般的存在）牵头开发的。开发之初就是围绕着完善Hadoop生态系统的数据处理而开展的（即用于Hadoop MapReduce需要对数据进行序列化和反序列化的场景），因此Hadoop MapReduce集成Avro也就是自然而然的事情。

这个例子是一个简单的Hadoop MapReduce作业：读取Avro格式的源文件进行计数统计，然后将计算结果作为Avro格式的数据写到目标文件中。主要目的是体会一下Hadoop MapReduce操作Avro的基本流程和Avro提供的API。

1. Maven依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>learn</groupId>
    <artifactId>learn.avro</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Avro core library -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.7.7</version>
        </dependency>
        <!-- Avro RPC support -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-ipc</artifactId>
            <version>1.7.7</version>
        </dependency>
        <!-- Avro utilities that let Hadoop MapReduce jobs process Avro files -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-mapred</artifactId>
            <version>1.7.7</version>
        </dependency>
        <!-- Hadoop itself (provides the MapReduce API used together with Avro) -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!--
                Avro code generation: runs the schema, protocol and idl-protocol goals
                during generate-sources, reading from src/main/avro and writing the
                generated Java sources into src/main/java.
            -->
            <plugin>
                <groupId>org.apache.avro</groupId>
                <artifactId>avro-maven-plugin</artifactId>
                <version>1.7.7</version>
                <executions>
                    <execution>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>schema</goal>
                            <goal>protocol</goal>
                            <goal>idl-protocol</goal>
                        </goals>
                        <configuration>
                            <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory>
                            <outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!-- Compile with Java 1.7 source and target levels -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

2. MapReduce代码：

package examples.avro.mapreduce;

import examples.avro.simple.User;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * MapReduce job that counts how often each favorite color occurs in an Avro
 * file of {@code User} records and writes the totals as Avro key/value pairs.
 *
 * <p>Usage: {@code MapReduceColorCount <input path> <output path>}</p>
 */
public class MapReduceColorCount extends Configured implements Tool {

    /**
     * Mapper definition.
     *
     * <p>Input key type is {@code AvroKey<User>}, input value type is
     * {@code NullWritable}; output key type is {@code Text}, output value type
     * is {@code IntWritable}.</p>
     */
    public static class ColorCountMapper extends
            Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {

        /** Constant count of one, emitted for every record instead of allocating per call. */
        private static final IntWritable ONE = new IntWritable(1);

        /**
         * Emits {@code (favoriteColor, 1)} for the given user record.
         *
         * @param key     wrapper holding the {@code User} datum read from the input file
         * @param value   unused; the input format supplies no value
         * @param context sink for the map output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(AvroKey<User> key, NullWritable value, Context context)
                throws IOException, InterruptedException {
            CharSequence color = key.datum().getFavoriteColor();
            if (color == null) {
                // Records without a favorite color are counted under "none".
                color = "none";
            }
            context.write(new Text(color.toString()), ONE);
        }
    }

    /**
     * Reducer definition.
     *
     * <p>Input key type is {@code Text} and input value type is
     * {@code IntWritable} (matching the Mapper's output key/value types);
     * output key type is {@code AvroKey<CharSequence>}, output value type is
     * {@code AvroValue<Integer>}.</p>
     */
    public static class ColorCountReducer extends
            Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {

        /**
         * Sums the counts for one color and writes {@code (color, total)}.
         *
         * @param key     the color name
         * @param values  the partial counts emitted for this color
         * @param context sink for the reduce output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));
        }
    }

    /**
     * Configures and runs the color-count job.
     *
     * @param args exactly two elements: the input path and the output path
     * @return {@code 0} if the job succeeded, {@code 1} if it failed, or
     *         {@code -1} if the arguments are invalid
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MapReduceColorCount <input path> <output path>");
            return -1;
        }

        Job job = new Job(getConf());
        job.setJarByClass(MapReduceColorCount.class);
        job.setJobName("Color Count");

        // Input path; the input files are in Avro format.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output path; the output is an Avro file of key/value records,
        // see AvroKeyValueOutputFormat.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // AvroKeyInputFormat: A MapReduce InputFormat that can handle Avro container files.
        job.setInputFormatClass(AvroKeyInputFormat.class);
        job.setMapperClass(ColorCountMapper.class);
        AvroJob.setInputKeySchema(job, User.getClassSchema());
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // AvroKeyValueOutputFormat: FileOutputFormat for writing Avro container files
        // of key/value pairs.
        job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
        job.setReducerClass(ColorCountReducer.class);
        AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

        return (job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Command-line entry point; runs the job through {@code ToolRunner} and
     * exits with the job's return code.
     *
     * @param args command-line arguments forwarded to {@link #run(String[])}
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new MapReduceColorCount(), args);
        System.exit(res);
    }
}

3. 主要类注释

3.1 AvroKey

/** The wrapper of keys for jobs configured with {@link AvroJob} . */

3.2 AvroValue

/** The wrapper of values for jobs configured with {@link AvroJob} . */

3.3 AvroJob

/** Setters to configure jobs for Avro data. */

3.4 AvroKeyInputFormat

/** * A MapReduce InputFormat that can handle Avro container files. * * <p>Keys are AvroKey wrapper objects that contain the Avro data.  Since Avro * container files store only records (not key/value pairs), the value from * this InputFormat is a NullWritable.</p> */

3.5 AvroKeyValueOutputFormat

/** * FileOutputFormat for writing Avro container files of key/value pairs. * * <p>Since Avro container files can only contain records (not key/value pairs), this * output format puts the key and value into an Avro generic record with two fields, named * 'key' and 'value'.</p> * * <p>The keys and values given to this output format may be Avro objects wrapped in * <code>AvroKey</code> or <code>AvroValue</code> objects.  The basic Writable types are * also supported (e.g., IntWritable, Text); they will be converted to their corresponding * Avro types.</p> * * @param <K> The type of key. If an Avro type, it must be wrapped in an <code>AvroKey</code>. * @param <V> The type of value. If an Avro type, it must be wrapped in an <code>AvroValue</code>. */

3.6 AvroJob.setInputKeySchema / setInputValueSchema

  /**
   * Records the schema of the job's input keys.
   *
   * <p>The schema's string form is stored in the job configuration under
   * {@code CONF_INPUT_KEY_SCHEMA}.</p>
   *
   * @param job The job to configure.
   * @param schema The input key schema.
   */
  public static void setInputKeySchema(Job job, Schema schema) {
    final String keySchemaText = schema.toString();
    job.getConfiguration().set(CONF_INPUT_KEY_SCHEMA, keySchemaText);
  }

  /**
   * Records the schema of the job's input values.
   *
   * <p>The schema's string form is stored in the job configuration under
   * {@code CONF_INPUT_VALUE_SCHEMA}.</p>
   *
   * @param job The job to configure.
   * @param schema The input value schema.
   */
  public static void setInputValueSchema(Job job, Schema schema) {
    final String valueSchemaText = schema.toString();
    job.getConfiguration().set(CONF_INPUT_VALUE_SCHEMA, valueSchemaText);
  }

3.7 AvroJob.setMapOutputKeySchema / setMapOutputValueSchema

/**
   * Sets the map output key schema.
   *
   * <p>Besides registering the schema, this switches the job's map output key
   * class to {@code AvroKey} and installs {@code AvroKeyComparator} as both the
   * grouping and the sort comparator. The same schema is used as the key
   * writer schema and the key reader schema.</p>
   *
   * @param job The job to configure.
   * @param schema The map output key schema.
   */
  public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    AvroSerialization.setKeyWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setKeyReaderSchema(job.getConfiguration(), schema);
    // Register AvroSerialization in the job configuration.
    AvroSerialization.addToConfiguration(job.getConfiguration());
  }

  /**
   * Sets the map output value schema.
   *
   * <p>Besides registering the schema, this switches the job's map output
   * value class to {@code AvroValue}. The same schema is used as the value
   * writer schema and the value reader schema.</p>
   *
   * @param job The job to configure.
   * @param schema The map output value schema.
   */
  public static void setMapOutputValueSchema(Job job, Schema schema) {
    job.setMapOutputValueClass(AvroValue.class);
    AvroSerialization.setValueWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setValueReaderSchema(job.getConfiguration(), schema);
    // Register AvroSerialization in the job configuration.
    AvroSerialization.addToConfiguration(job.getConfiguration());
  }

上一篇：【Hive十一】Hive数据倾斜优化

下一篇：【Avro一】Avro入门

请勿发布不友善或者负能量的内容。与人为善，比聪明更重要！

<div  >  <p>Avro是Doug Cutting(此人绝对是神一般的存在）牵头开发的。 开发之初就是围绕着完善Hadoop生态系统的数据处理而开展的（使用Avro作为Hadoop MapReduce需要处理数据序列化和反序列化的场景）,因此Hadoop MapReduce集成Avro也就是自然而然的事情。</p>  <p>这个例子是一个简单的Hadoop MapReduce读取Avro格式的源文件进行计数统计，然后将计算结果作为Avro格式的数据写到目标文件中，主要目的是体会下Hadoop MapReduce操作Avro的基本流程和Avro提供的API</p>  <p>&nbsp;</p>  <h2>1. Maven依赖</h2>  <pre class="xml" name="code">&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;&lt;project xmlns=&quot;http://maven.apache.org/POM/4.0.0&quot;         xmlns:xsi=&quot;http://www.w3.org/2001/XMLSchema-instance&quot;         xsi:schemaLocation=&quot;http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd&quot;&gt;    &lt;modelVersion&gt;4.0.0&lt;/modelVersion&gt;    &lt;groupId&gt;learn&lt;/groupId&gt;    &lt;artifactId&gt;learn.avro&lt;/artifactId&gt;    &lt;version&gt;1.0-SNAPSHOT&lt;/version&gt;    &lt;dependencies&gt;        &lt;!--avro core--&gt;        &lt;dependency&gt;            &lt;groupId&gt;org.apache.avro&lt;/groupId&gt;            &lt;artifactId&gt;avro&lt;/artifactId&gt;            &lt;version&gt;1.7.7&lt;/version&gt;        &lt;/dependency&gt;        &lt;!--avro rpc support--&gt;        &lt;dependency&gt;            &lt;groupId&gt;org.apache.avro&lt;/groupId&gt;            &lt;artifactId&gt;avro-ipc&lt;/artifactId&gt;            &lt;version&gt;1.7.7&lt;/version&gt;        &lt;/dependency&gt;        &lt;!--avro utilities for Hadoop MapReduce to process avro files --&gt;        &lt;dependency&gt;            &lt;groupId&gt;org.apache.avro&lt;/groupId&gt;            &lt;artifactId&gt;avro-mapred&lt;/artifactId&gt;            &lt;version&gt;1.7.7&lt;/version&gt;        &lt;/dependency&gt;        &lt;!--Avro and Hadoop Map Reduce--&gt;        &lt;dependency&gt;            &lt;groupId&gt;org.apache.hadoop&lt;/groupId&gt;            &lt;artifactId&gt;hadoop-core&lt;/artifactId&gt;            &lt;version&gt;1.2.1&lt;/version&gt;        &lt;/dependency&gt;    &lt;/dependencies&gt;    &lt;build&gt;        
&lt;plugins&gt;            &lt;plugin&gt;                &lt;groupId&gt;org.apache.avro&lt;/groupId&gt;                &lt;artifactId&gt;avro-maven-plugin&lt;/artifactId&gt;                &lt;version&gt;1.7.7&lt;/version&gt;                &lt;executions&gt;                    &lt;execution&gt;                        &lt;phase&gt;generate-sources&lt;/phase&gt;                        &lt;goals&gt;                            &lt;goal&gt;schema&lt;/goal&gt;                            &lt;goal&gt;protocol&lt;/goal&gt;                            &lt;goal&gt;idl-protocol&lt;/goal&gt;                        &lt;/goals&gt;                        &lt;configuration&gt;                            &lt;sourceDirectory&gt;${project.basedir}/src/main/avro/&lt;/sourceDirectory&gt;                            &lt;outputDirectory&gt;${project.basedir}/src/main/java/&lt;/outputDirectory&gt;                        &lt;/configuration&gt;                    &lt;/execution&gt;                &lt;/executions&gt;            &lt;/plugin&gt;            &lt;plugin&gt;                &lt;groupId&gt;org.apache.maven.plugins&lt;/groupId&gt;                &lt;artifactId&gt;maven-compiler-plugin&lt;/artifactId&gt;                &lt;configuration&gt;                    &lt;source&gt;1.7&lt;/source&gt;                    &lt;target&gt;1.7&lt;/target&gt;                &lt;/configuration&gt;            &lt;/plugin&gt;        &lt;/plugins&gt;    &lt;/build&gt;&lt;/project&gt;</pre>  <p>&nbsp;</p>  <h2>&nbsp;</h2>  <h2>2. 
MapReduce代码：</h2>  <pre class="java" name="code">package examples.avro.mapreduce;import examples.avro.simple.User;import org.apache.avro.Schema;import org.apache.avro.mapred.AvroKey;import org.apache.avro.mapred.AvroValue;import org.apache.avro.mapreduce.AvroJob;import org.apache.avro.mapreduce.AvroKeyInputFormat;import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;public class MapReduceColorCount extends Configured implements Tool {    ///Mapper定义：    ///输入Key类型是AvroKey&lt;User&gt;，输入Value类型是NullWritable    ///输出Key类型是Text，输出Value类型是IntWritable    public static class ColorCountMapper extends            Mapper&lt;AvroKey&lt;User&gt;, NullWritable, Text, IntWritable&gt; {        @Override        public void map(AvroKey&lt;User&gt; key, NullWritable value, Context context)                throws IOException, InterruptedException {            CharSequence color = key.datum().getFavoriteColor();            if (color == null) {                color = &quot;none&quot;;            }            context.write(new Text(color.toString()), new IntWritable(1));        }    }    ///Reducer定义：    ///输入Key类型是Text，输入Value类型是IntWritable(跟Key的输出Key/Value类型一致)    ///输出Key类型是AvroKey&lt;CharSequence&gt;，输出Value类型是AvroValue&lt;Integer&gt;    public static class ColorCountReducer extends            Reducer&lt;Text, IntWritable, AvroKey&lt;CharSequence&gt;, AvroValue&lt;Integer&gt;&gt; {        @Override        public void reduce(Text key, 
Iterable&lt;IntWritable&gt; values,                           Context context) throws IOException, InterruptedException {            int sum = 0;            for (IntWritable value : values) {                sum += value.get();            }            context.write(new AvroKey&lt;CharSequence&gt;(key.toString()), new AvroValue&lt;Integer&gt;(sum));        }    }    public int run(String[] args) throws Exception {        if (args.length != 2) {            System.err.println(&quot;Usage: MapReduceColorCount &lt;input path&gt; &lt;output path&gt;&quot;);            return -1;        }        Job job = new Job(getConf());        job.setJarByClass(MapReduceColorCount.class);        job.setJobName(&quot;Color Count&quot;);        ///指定输入路径，输入文件是Avro格式        FileInputFormat.setInputPaths(job, new Path(args[0]));        ///指定输出路径，输出文件格式是Key/Value组成的Avro文件，见AvroKeyValueOutputFormat        FileOutputFormat.setOutputPath(job, new Path(args[1]));        //AvroKeyInputFormat: A MapReduce InputFormat that can handle Avro container files.        job.setInputFormatClass(AvroKeyInputFormat.class);        job.setMapperClass(ColorCountMapper.class);        AvroJob.setInputKeySchema(job, User.getClassSchema());        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(IntWritable.class);        //AvroKeyValueOutputFormat: FileOutputFormat for writing Avro container files of key/value pairs        job.setOutputFormatClass(AvroKeyValueOutputFormat.class);        job.setReducerClass(ColorCountReducer.class);        AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));        AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));        return (job.waitForCompletion(true) ? 0 : 1);    }    public static void main(String[] args) throws Exception {        int res = ToolRunner.run(new MapReduceColorCount(), args);        System.exit(res);    }}</pre>  <p>&nbsp;</p>  <p>&nbsp;</p>  <h2>3. 
主要类注释</h2>  <p>3.1 AvroKey</p>  <pre class="java" name="code">/** The wrapper of keys for jobs configured with {@link AvroJob} . */</pre>  <p>&nbsp;</p>  <p>3.2 AvroValue</p>  <pre class="java" name="code">/** The wrapper of values for jobs configured with {@link AvroJob} . */</pre>  <p>&nbsp;</p>  <p>3.3 AvroJob</p>  <pre class="java" name="code">/** Setters to configure jobs for Avro data. */</pre>  <p>&nbsp;</p>  <p>3.4 AvroKeyInputFormat</p>  <pre class="java" name="code">/** * A MapReduce InputFormat that can handle Avro container files. * * &lt;p&gt;Keys are AvroKey wrapper objects that contain the Avro data.  Since Avro * container files store only records (not key/value pairs), the value from * this InputFormat is a NullWritable.&lt;/p&gt; */</pre>  <p>&nbsp;</p>  <p>3.5 AvroKeyValueOutputFormat</p>  <pre class="java" name="code">/** * FileOutputFormat for writing Avro container files of key/value pairs. * * &lt;p&gt;Since Avro container files can only contain records (not key/value pairs), this * output format puts the key and value into an Avro generic record with two fields, named * 'key' and 'value'.&lt;/p&gt; * * &lt;p&gt;The keys and values given to this output format may be Avro objects wrapped in * &lt;code&gt;AvroKey&lt;/code&gt; or &lt;code&gt;AvroValue&lt;/code&gt; objects.  The basic Writable types are * also supported (e.g., IntWritable, Text); they will be converted to their corresponding * Avro types.&lt;/p&gt; * * @param &lt;K&gt; The type of key. If an Avro type, it must be wrapped in an &lt;code&gt;AvroKey&lt;/code&gt;. * @param &lt;V&gt; The type of value. If an Avro type, it must be wrapped in an &lt;code&gt;AvroValue&lt;/code&gt;. */</pre>  <p>&nbsp;</p>  <p>3.6</p>  <pre class="java" name="code">  /**   * Sets the job input key schema.   *   * @param job The job to configure.   * @param schema The input key schema.   
*/  public static void setInputKeySchema(Job job, Schema schema) {    job.getConfiguration().set(CONF_INPUT_KEY_SCHEMA, schema.toString());  }  /**   * Sets the job input value schema.   *   * @param job The job to configure.   * @param schema The input value schema.   */  public static void setInputValueSchema(Job job, Schema schema) {    job.getConfiguration().set(CONF_INPUT_VALUE_SCHEMA, schema.toString());  }</pre>  <p>&nbsp;</p>  <p>3.7</p>  <pre class="java" name="code">/**   * Sets the map output key schema.   *   * @param job The job to configure.   * @param schema The map output key schema.   */  public static void setMapOutputKeySchema(Job job, Schema schema) {    job.setMapOutputKeyClass(AvroKey.class);    job.setGroupingComparatorClass(AvroKeyComparator.class);    job.setSortComparatorClass(AvroKeyComparator.class);    AvroSerialization.setKeyWriterSchema(job.getConfiguration(), schema);    AvroSerialization.setKeyReaderSchema(job.getConfiguration(), schema);    AvroSerialization.addToConfiguration(job.getConfiguration());  }  /**   * Sets the map output value schema.   *   * @param job The job to configure.   * @param schema The map output value schema.   */  public static void setMapOutputValueSchema(Job job, Schema schema) {    job.setMapOutputValueClass(AvroValue.class);    AvroSerialization.setValueWriterSchema(job.getConfiguration(), schema);    AvroSerialization.setValueReaderSchema(job.getConfiguration(), schema);    AvroSerialization.addToConfiguration(job.getConfiguration());  }</pre>  <p>&nbsp;</p>  <p>&nbsp;</p>  <p>&nbsp;</p> </div>

留言需要登录哦

技术博客集 - 网站简介：
前后端技术：
后端基于Hyperf2.1框架开发,前端使用Bootstrap可视化布局系统生成
网站主要作用：
1.编程技术分享及讨论交流，内置聊天系统;
2.测试交流框架问题，比如：Hyperf、Laravel、TP、beego;
3.本站数据以大数据采集等爬虫技术为基础，助力知识分享；如有侵权请发邮件到站长邮箱，站长会尽快处理;
4.站长邮箱：[email protected];

文章归档

文章标签

友情链接

首页
关于我们

Author · HouTiZong: 侯体宗的博客