[jira] [Commented] (NUTCH-2574) Generator: hostCount >= maxCount comparison wrong

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[jira] [Commented] (NUTCH-2574) Generator: hostCount >= maxCount comparison wrong

JIRA jira@apache.org

    [ https://issues.apache.org/jira/browse/NUTCH-2574?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16510782#comment-16510782 ]

ASF GitHub Bot commented on NUTCH-2574:
---------------------------------------

sebastian-nagel closed pull request #344: NUTCH-2574 Generator: hostCount >= maxCount comparison wrong
URL: https://github.com/apache/nutch/pull/344
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 9c22ee228..2fa1b460c 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -283,7 +283,7 @@ public void map(Text key, CrawlDatum value,
       private HashMap<String, int[]> hostCounts = new HashMap<>();
       private long count;
       private int currentsegmentnum = 1;
-      private MultipleOutputs mos;
+      private MultipleOutputs<FloatWritable, SelectorEntry> mos;
       private String outputFile;
       private long limit;
       private int segCounts[];
@@ -361,7 +361,7 @@ private JexlContext createContext(HostDatum datum) {
       @Override
       public void setup(Context context) throws IOException {
         conf = context.getConfiguration();
-        mos = new MultipleOutputs(context);
+        mos = new MultipleOutputs<FloatWritable, SelectorEntry>(context);
         Job job = Job.getInstance(conf);
         limit = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
                      / job.getNumReduceTasks();
@@ -398,7 +398,8 @@ public void reduce(FloatWritable key, Iterable<SelectorEntry> values,
         HostDatum host = null;
         LongWritable variableFetchDelayWritable = null; // in millis
         Text variableFetchDelayKey = new Text("_variableFetchDelay_");
-        int temp_maxCount = maxCount;
+          // local variable maxCount may hold host-specific count set in HostDb
+        int maxCount = this.maxCount;
         for (SelectorEntry entry : values) {
           Text url = entry.url;
           String urlString = url.toString();
@@ -486,12 +487,12 @@ public void reduce(FloatWritable key, Iterable<SelectorEntry> values,
 
             // reached the limit of allowed URLs per host / domain
             // see if we can put it in the next segment?
-            if (hostCount[1] >= maxCount) {
+            if (hostCount[1] > maxCount) {
               if (hostCount[0] < maxNumSegments) {
                 hostCount[0]++;
-                hostCount[1] = 0;
+                hostCount[1] = 1;
               } else {
-                if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) {
+                if (hostCount[1] == maxCount && LOG.isInfoEnabled()) {
                   LOG.info("Host or domain "
                       + hostordomain
                       + " has more than "
@@ -519,7 +520,6 @@ public void reduce(FloatWritable key, Iterable<SelectorEntry> values,
           // maxCount may cause us to skip it.
           count++;
         }
-        maxCount = temp_maxCount;
       }
       
       private String generateFileName(SelectorEntry entry) {
diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java
index 5ea4da336..9a21146b2 100644
--- a/src/test/org/apache/nutch/crawl/TestGenerator.java
+++ b/src/test/org/apache/nutch/crawl/TestGenerator.java
@@ -145,8 +145,9 @@ public void testGenerateHostLimit() throws Exception {
 
     createCrawlDB(list);
 
+    int maxPerHost = 1;
     Configuration myConfiguration = new Configuration(conf);
-    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
     Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
         myConfiguration, false);
 
@@ -156,10 +157,13 @@ public void testGenerateHostLimit() throws Exception {
     ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
 
     // verify we got right amount of records
-    Assert.assertEquals(1, fetchList.size());
+    int expectedFetchListSize = Math.min(maxPerHost, list.size());
+    Assert.assertEquals("Failed to apply generate.max.count by host",
+        expectedFetchListSize, fetchList.size());
 
+    maxPerHost = 2;
     myConfiguration = new Configuration(conf);
-    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
     generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
         false);
 
@@ -169,10 +173,13 @@ public void testGenerateHostLimit() throws Exception {
     fetchList = readContents(fetchlistPath);
 
     // verify we got right amount of records
-    Assert.assertEquals(2, fetchList.size());
+    expectedFetchListSize = Math.min(maxPerHost, list.size());
+    Assert.assertEquals("Failed to apply generate.max.count by host",
+        expectedFetchListSize, fetchList.size());
 
+    maxPerHost = 3;
     myConfiguration = new Configuration(conf);
-    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
     generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
         false);
 
@@ -182,7 +189,9 @@ public void testGenerateHostLimit() throws Exception {
     fetchList = readContents(fetchlistPath);
 
     // verify we got right amount of records
-    Assert.assertEquals(3, fetchList.size());
+    expectedFetchListSize = Math.min(maxPerHost, list.size());
+    Assert.assertEquals("Failed to apply generate.max.count by host",
+        expectedFetchListSize, fetchList.size());
   }
 
   /**
@@ -201,8 +210,9 @@ public void testGenerateDomainLimit() throws Exception {
 
     createCrawlDB(list);
 
+    int maxPerDomain = 1;
     Configuration myConfiguration = new Configuration(conf);
-    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
     myConfiguration.set(Generator.GENERATOR_COUNT_MODE,
         Generator.GENERATOR_COUNT_VALUE_DOMAIN);
 
@@ -215,10 +225,13 @@ public void testGenerateDomainLimit() throws Exception {
     ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
 
     // verify we got right amount of records
-    Assert.assertEquals(1, fetchList.size());
+    int expectedFetchListSize = Math.min(maxPerDomain, list.size());
+    Assert.assertEquals("Failed to apply generate.max.count by domain",
+        expectedFetchListSize, fetchList.size());
 
+    maxPerDomain = 2;
     myConfiguration = new Configuration(myConfiguration);
-    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
     generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
         false);
 
@@ -228,10 +241,13 @@ public void testGenerateDomainLimit() throws Exception {
     fetchList = readContents(fetchlistPath);
 
     // verify we got right amount of records
-    Assert.assertEquals(2, fetchList.size());
+    expectedFetchListSize = Math.min(maxPerDomain, list.size());
+    Assert.assertEquals("Failed to apply generate.max.count by domain",
+        expectedFetchListSize, fetchList.size());
 
+    maxPerDomain = 3;
     myConfiguration = new Configuration(myConfiguration);
-    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
     generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
         false);
 
@@ -241,7 +257,9 @@ public void testGenerateDomainLimit() throws Exception {
     fetchList = readContents(fetchlistPath);
 
     // verify we got right amount of records
-    Assert.assertEquals(3, fetchList.size());
+    expectedFetchListSize = Math.min(maxPerDomain, list.size());
+    Assert.assertEquals("Failed to apply generate.max.count by domain",
+        expectedFetchListSize, fetchList.size());
   }
 
   /**


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[hidden email]


> Generator: hostCount >= maxCount comparison wrong
> -------------------------------------------------
>
>                 Key: NUTCH-2574
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2574
>             Project: Nutch
>          Issue Type: Bug
>          Components: generator
>    Affects Versions: 1.13
>            Reporter: Michael Coffey
>            Assignee: Sebastian Nagel
>            Priority: Minor
>             Fix For: 1.15
>
>
> In the Generator.Selector.reduce function, there is a comparison of hostCount[1] to maxCount, to determine whether or not to push the current URL to the next segment. The purpose is to honor generate.max.count.
> Sebastian noticed that it should test if (hostCount[1] > maxCount) rather than ">=".  As it stands, the code sometimes puts one less url into a segment than it should.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)