
[2019-05-08] MapReduce Types and Formats

Author: BigBigFlower | Published 2019-05-08 14:10

(1) MapReduce types
map:      (k1, v1)       -> list(k2, v2)
combiner: (k2, list(v2)) -> list(k2, v2)
reduce:   (k2, list(v2)) -> list(k3, v3)
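
To make these signatures concrete, here is a minimal word-count-style sketch (illustrative only, not part of the original post) showing how they map onto the generic parameters Mapper<K1, V1, K2, V2> and Reducer<K2, V2, K3, V3> of the Java API:

// Illustrative sketch: the class names are hypothetical, the APIs are standard Hadoop.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// map: (k1, v1) -> list(k2, v2), where k1 is a byte offset and v1 a line of text
class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    for (String token : value.toString().split("\\s+")) {
      if (!token.isEmpty()) {
        word.set(token);
        context.write(word, ONE);  // emit (k2 = word, v2 = 1)
      }
    }
  }
}

// reduce: (k2, list(v2)) -> list(k3, v3), here summing the counts for each word
class TokenSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    context.write(key, new IntWritable(sum));  // emit (k3 = word, v3 = total)
  }
}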

Tables and figures referenced at this point in the original post: the MapReduce API properties for configuring types, the Streaming separator properties, and the InputFormat class hierarchy.
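
In code, these type-related properties are usually set through the Job API rather than as raw configuration keys. Below is a minimal driver sketch (the class name is hypothetical; the setters are standard Job methods). The map output types default to the final output types when they are not set explicitly.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver fragment showing the Job setters that declare k2/v2 and k3/v3.
public class TypeConfigurationSketch {
  public static Job configure(Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "type configuration sketch");
    // Intermediate (map output) types: k2 and v2.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // Final (reduce output) types: k3 and v3.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    return job;
  }
}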

Some applications do not want their input files to be split; instead, a single mapper should process each input file in its entirety. To prevent splitting, subclass a concrete FileInputFormat implementation and override isSplitable() to return false:

// NonSplittableTextInputFormat: a TextInputFormat whose files are never split
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class NonSplittableTextInputFormat extends TextInputFormat {
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return false;
  }
}
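
As an illustrative driver fragment (the class and method names below are hypothetical), there are two common ways to keep each file in a single split: plug in the subclass above, or push the minimum split size above the size of any input file.

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class NoSplitConfigSketch {
  // Option 1: use the subclass that overrides isSplitable() to return false.
  static void useNonSplittableFormat(Job job) {
    job.setInputFormatClass(NonSplittableTextInputFormat.class);
  }

  // Option 2: keep the stock TextInputFormat but make the minimum split size
  // effectively infinite, so no file is ever split.
  static void useHugeMinSplitSize(Job job) {
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
  }
}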

Processing a whole file as a single record:

// WholeFileInputFormat: an InputFormat for reading a whole file as a single record
import java.io.IOException;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.*;

public class WholeFileInputFormat
    extends FileInputFormat<NullWritable, BytesWritable> {
  
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return false;
  }

  @Override
  public RecordReader<NullWritable, BytesWritable> createRecordReader(
      InputSplit split, TaskAttemptContext context) throws IOException,
      InterruptedException {
    WholeFileRecordReader reader = new WholeFileRecordReader();
    reader.initialize(split, context);
    return reader;
  }
}

The RecordReader that WholeFileInputFormat uses to deliver the whole file as one record:


// WholeFileRecordReader: the RecordReader used by WholeFileInputFormat to read a whole file as a record
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
  
  private FileSplit fileSplit;
  private Configuration conf;
  private BytesWritable value = new BytesWritable();
  private boolean processed = false;

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    this.fileSplit = (FileSplit) split;
    this.conf = context.getConfiguration();
  }
  
  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!processed) {
      byte[] contents = new byte[(int) fileSplit.getLength()];
      Path file = fileSplit.getPath();
      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream in = null;
      try {
        in = fs.open(file);
        IOUtils.readFully(in, contents, 0, contents.length);
        value.set(contents, 0, contents.length);
      } finally {
        IOUtils.closeStream(in);
      }
      processed = true;
      return true;
    }
    return false;
  }
  
  @Override
  public NullWritable getCurrentKey() throws IOException, InterruptedException {
    return NullWritable.get();
  }

  @Override
  public BytesWritable getCurrentValue() throws IOException,
      InterruptedException {
    return value;
  }

  @Override
  public float getProgress() throws IOException {
    return processed ? 1.0f : 0.0f;
  }

  @Override
  public void close() throws IOException {
    // do nothing
  }
}

A MapReduce program that packages a collection of small files into a single SequenceFile:

// SmallFilesToSequenceFileConverter: a MapReduce program for packaging a collection of small files into a single SequenceFile
import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SmallFilesToSequenceFileConverter extends Configured
    implements Tool {
  
  static class SequenceFileMapper
      extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    
    private Text filenameKey;
    
    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      InputSplit split = context.getInputSplit();
      Path path = ((FileSplit) split).getPath();
      filenameKey = new Text(path.toString());
    }
    
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context)
        throws IOException, InterruptedException {
      context.write(filenameKey, value);
    }
    
  }

  @Override
  public int run(String[] args) throws Exception {
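    // JobBuilder is a helper class from the book's accompanying example code
    // (not shown here); it parses the generic input/output path arguments and
    // returns a configured Job, or null when the arguments are missing.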
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
      return -1;
    }
    
    job.setInputFormatClass(WholeFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setMapperClass(SequenceFileMapper.class);

    return job.waitForCompletion(true) ? 0 : 1;
  }
  
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
    System.exit(exitCode);
  }
}
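
To sanity-check the result, the packed SequenceFile can be read back and each original file name (the key) printed with the size of its contents (the value). This is a hypothetical verification snippet, not part of the original post; the class name and argument handling are assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadPackedFilesSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]);  // e.g. a part-r-00000 file under the output directory
    SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
    try {
      Text key = new Text();
      BytesWritable value = new BytesWritable();
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value.getLength() + " bytes");
      }
    } finally {
      reader.close();
    }
  }
}
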
The OutputFormat class hierarchy

Partitioning data
The MultipleOutputs class lets you write data to multiple files whose names are derived from the output keys and values, or from an arbitrary string.

// PartitionByStationYearUsingMultipleOutputs: partitions the weather dataset
// into per-station, per-year files using MultipleOutputs
import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class PartitionByStationYearUsingMultipleOutputs extends Configured
 implements Tool {
 
 static class StationMapper
   extends Mapper<LongWritable, Text, Text, Text> {
 
   private NcdcRecordParser parser = new NcdcRecordParser();
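   // NcdcRecordParser is a helper class from the book's accompanying example
   // code (not shown here); it parses a line of NCDC weather data and exposes
   // fields such as the station ID and the year.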
   
   @Override
   protected void map(LongWritable key, Text value, Context context)
       throws IOException, InterruptedException {
     parser.parse(value);
     context.write(new Text(parser.getStationId()), value);
   }
 }
 
 static class MultipleOutputsReducer
   extends Reducer<Text, Text, NullWritable, Text> {
   
   private MultipleOutputs<NullWritable, Text> multipleOutputs;
   private NcdcRecordParser parser = new NcdcRecordParser();

   @Override
   protected void setup(Context context)
       throws IOException, InterruptedException {
     multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
   }

   @Override
   protected void reduce(Text key, Iterable<Text> values, Context context)
       throws IOException, InterruptedException {
     for (Text value : values) {
       parser.parse(value);
       String basePath = String.format("%s/%s/part",
           parser.getStationId(), parser.getYear());
       multipleOutputs.write(NullWritable.get(), value, basePath);
        // The base path passed to MultipleOutputs.write() is interpreted relative
        // to the job's output directory; because it may contain file path
        // separators (/), subdirectories of arbitrary depth can be created.
     }
   }
   
   @Override
   protected void cleanup(Context context)
       throws IOException, InterruptedException {
     multipleOutputs.close();
   }
 }

 @Override
 public int run(String[] args) throws Exception {
   Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
   if (job == null) {
     return -1;
   }
   
   job.setMapperClass(StationMapper.class);
   job.setMapOutputKeyClass(Text.class);
   job.setReducerClass(MultipleOutputsReducer.class);
   job.setOutputKeyClass(NullWritable.class);

   return job.waitForCompletion(true) ? 0 : 1;
 }
 public static void main(String[] args) throws Exception {
   int exitCode = ToolRunner.run(new PartitionByStationYearUsingMultipleOutputs(),
       args);
   System.exit(exitCode);
 }
}
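
MultipleOutputs can also write to outputs identified by an arbitrary name rather than a base path derived from the record. As a minimal sketch (the "stations" output name and the class name are illustrative), the named output is declared in the driver with addNamedOutput(), and the reducer then writes to it by name.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class NamedOutputSketch {
  // Driver side: declare a named output called "stations" with its own
  // output format and key/value types.
  static void declareNamedOutput(Job job) {
    MultipleOutputs.addNamedOutput(job, "stations", TextOutputFormat.class,
        NullWritable.class, Text.class);
  }
  // Reducer side (given a MultipleOutputs instance created in setup()):
  //   multipleOutputs.write("stations", NullWritable.get(), value);
}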
