Skip to content

Commit

Permalink
fix of the partitioner bug for NUTCH-2455
Browse files (browse the repository at this point in the history)
  • Loading branch information
okedoki committed Dec 27, 2017
1 parent e20973c commit 16f26f1
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
5 changes: 2 additions & 3 deletions src/java/org/apache/nutch/crawl/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ public int compareTo(FloatTextPair tp) {
}

}
//The comparator is made to "merge" hostdb data and crawldd data. See NUTCH-2455
//The comparator is made to "merge" hostdb data and crawldb data. See NUTCH-2455
//TODO : Implement RawComparator
public static class ScoreHostKeyComparator extends WritableComparator {
protected ScoreHostKeyComparator() {
Expand Down Expand Up @@ -456,7 +456,6 @@ public void reduce(FloatTextPair key, Iterator<SelectorEntry> values,
OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
throws IOException {

String hostname = null;
LongWritable variableFetchDelayWritable = null; // in millis
Text variableFetchDelayKey = new Text("_variableFetchDelay_");
int maxCount = this.maxCount;
Expand Down Expand Up @@ -832,7 +831,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
job.setOutputKeyComparatorClass(ScoreHostKeyComparator.class);
job.setPartitionerClass(Selector.class);
job.setReducerClass(Selector.class);

FileOutputFormat.setOutputPath(job, tempDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
Expand Down
15 changes: 11 additions & 4 deletions src/java/org/apache/nutch/crawl/URLPartitioner.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ public void close() {

public String getNormalizedURLRoot(Text key){
URLNormalizers normalizers = this.normalizers;
String rootMode = mode;

String urlString = key.toString();
URL url = null;
Expand All @@ -86,11 +85,19 @@ public String getNormalizedURLRoot(Text key){
return urlString;
}


/** Hash by host or domain name or IP address. */
public int getPartition(Text key, Writable value, int numReduceTasks) {
String urlString = key.toString();
URL url = null;
int hashCode = getNormalizedURLRoot(key).hashCode();
String partitionKey;

if(key.toString().isEmpty()){
partitionKey = value.toString();
} else
{
partitionKey = getNormalizedURLRoot(key);
}

int hashCode = partitionKey.hashCode();
// make hosts wind up in different partitions on different runs
hashCode ^= seed;

Expand Down

0 comments on commit 16f26f1

Please sign in to comment.