FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));中仅处理了Segment文件夹下“parse_data”与“parse_text”中的内容,对于原始的网页快照文件夹(“content”)并没有进行处理,Segment文件夹结构如下所示:

我们在这里要做的就是将“content”文件夹加入处理方法中,修改后的方法如下:

public static void initMRJob(Path crawlDb, Path linkDb,

                           Collection<Path> segments,

                           JobConf job) {

     

    final String DIR_CACHE = "content";

 

    LOG.info("IndexerMapReduce: crawldb: " + crawlDb);

    LOG.info("IndexerMapReduce: linkdb: " + linkDb);

 

    for (final Path segment : segments) {

      LOG.info("IndexerMapReduces: adding segment: " + segment);

      FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));

      FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));

      FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));

      FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

 

      FileInputFormat.addInputPath(job, new Path(segment, DIR_CACHE));

    }

 

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

    FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);

 

    job.setMapperClass(IndexerMapReduce.class);

    job.setReducerClass(IndexerMapReduce.class);

 

    job.setOutputFormat(IndexerOutputFormat.class);

    job.setOutputKeyClass(Text.class);

    job.setMapOutputValueClass(NutchWritable.class);

    job.setOutputValueClass(NutchWritable.class);

  }