fetcher hangs and thread lifetime

classic Classic list List threaded Threaded
8 messages Options
Reply | Threaded
Open this post in threaded view
|

fetcher hangs and thread lifetime

Jon Shoberg

   Is there a way to set the lifetime of a fetching thread?  That is, if a
thread cannot complete the entire fetching process in X minutes, can it
gracefully give up?

   Has anyone else experienced the fetcher hanging for a long period of
time (an hour or more)?  I'm using 100 threads, 30 per host.  I'm guessing
that there is one host it is "stuck" on.

-j

*** Uptime and CPU *****************************************************

[jon@crawlr~]$ uptime
  00:25:14 up 4 days,  5:24,  4 users,  load average: 10.94, 10.82, 10.07

*** Thread Dump ********************************************************

         Full thread dump Java HotSpot(TM) 64-Bit Server VM
(1.5.0_04-b05 mixed mode):

"fetcher124" prio=1 tid=0x00002aabb5504cf0 nid=0x541e runnable
[0x0000000048a42000..0x0000000048a42b30]
         at java.lang.String.<init>(String.java:208)
         at java.lang.StringBuffer.toString(StringBuffer.java:586)
         - locked <0x00002aab226bdaf8> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher117" prio=1 tid=0x00002aabb4a08e00 nid=0x5415 runnable
[0x000000004833b000..0x000000004833beb0]
         at java.lang.String.<init>(String.java:208)
         at java.lang.StringBuffer.toString(StringBuffer.java:586)
         - locked <0x00002aab226bdb40> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher112" prio=1 tid=0x00002aabb4a03810 nid=0x5410 runnable
[0x0000000047e36000..0x0000000047e36d30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab880a56d0> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher106" prio=1 tid=0x00002aabb4d7d720 nid=0x540a runnable
[0x0000000047830000..0x0000000047830c30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab1f412728> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher101" prio=1 tid=0x00002aabb4242b30 nid=0x5405 runnable
[0x000000004732b000..0x000000004732beb0]
         at java.lang.String.<init>(String.java:208)
         at java.lang.StringBuffer.toString(StringBuffer.java:586)
         - locked <0x00002aaba26d80c8> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher100" prio=1 tid=0x00002aabb4241a00 nid=0x5404 runnable
[0x000000004722a000..0x000000004722ab30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab8d3359d0> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher97" prio=1 tid=0x00002aabb423e670 nid=0x5401 runnable
[0x0000000046f27000..0x0000000046f27cb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab9057f610> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher96" prio=1 tid=0x00002aabb423d540 nid=0x5400 runnable
[0x0000000046e26000..0x0000000046e26d30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab226bdcd0> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher94" prio=1 tid=0x00002aabb42ea7a0 nid=0x53fe runnable
[0x0000000046c24000..0x0000000046c24e30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab98eeeaf0> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher91" prio=1 tid=0x00002aabb42e7410 nid=0x53fb runnable
[0x0000000046921000..0x0000000046921bb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab226bdd68> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher81" prio=1 tid=0x00002aabb4a379c0 nid=0x53f1 runnable
[0x0000000045f17000..0x0000000045f17cb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab8fd6e840> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher80" prio=1 tid=0x00002aabb4a36890 nid=0x53f0 runnable
[0x0000000045e16000..0x0000000045e16d30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aaba26d0168> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher79" prio=1 tid=0x00002aabb4a35760 nid=0x53ef runnable
[0x0000000045d15000..0x0000000045d15db0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab941bbfe8> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher73" prio=1 tid=0x00002aabb5179040 nid=0x53e9 runnable
[0x000000004570f000..0x000000004570fcb0]
         at
org.apache.xerces.dom.CharacterDataImpl.setNodeValueInternal(Unknown Source)
         at org.apache.xerces.dom.CharacterDataImpl.setNodeValue(Unknown
Source)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher66" prio=1 tid=0x00002aabb518a3f0 nid=0x53e2 runnable
[0x0000000045008000..0x0000000045008c30]
         at java.lang.String.<init>(String.java:208)
         at java.lang.StringBuffer.toString(StringBuffer.java:586)
         - locked <0x00002aab85b2b0a8> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher65" prio=1 tid=0x00002aabb56d2660 nid=0x53e1 runnable
[0x0000000044f07000..0x0000000044f07cb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aaba26d0240> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher61" prio=1 tid=0x00002aabb0d5b7c0 nid=0x53dd runnable
[0x0000000044b03000..0x0000000044b03eb0]
         at java.lang.String.<init>(String.java:208)
         at java.lang.StringBuffer.toString(StringBuffer.java:586)
         - locked <0x00002aab9cc4a970> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher54" prio=1 tid=0x00002aabb0d44ad0 nid=0x53d6 runnable
[0x00000000443fc000..0x00000000443fce30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab226bbd40> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher51" prio=1 tid=0x00002aabb4243760 nid=0x53d3 runnable
[0x00000000440f9000..0x00000000440f9bb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab923fdee0> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher44" prio=1 tid=0x00002aabb421ce60 nid=0x53cc runnable
[0x00000000439f2000..0x00000000439f2b30]
         at java.lang.String.<init>(String.java:208)
         at java.lang.StringBuffer.toString(StringBuffer.java:586)
         - locked <0x00002aab8eb471c8> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher42" prio=1 tid=0x00002aabb4a12a10 nid=0x53ca runnable
[0x00000000437f0000..0x00000000437f0c30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aaba26d8300> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher38" prio=1 tid=0x00002aabb4c618d0 nid=0x53c6 runnable
[0x00000000433ec000..0x00000000433ece30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab9c521590> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher31" prio=1 tid=0x00002aabb46dfb30 nid=0x53bf runnable
[0x0000000042ce5000..0x0000000042ce5db0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab1f414868> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher30" prio=1 tid=0x00002aabb46d6710 nid=0x53be runnable
[0x0000000042be4000..0x0000000042be4e30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab226bbe68> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher21" prio=1 tid=0x00002aabb4667910 nid=0x53b5 runnable
[0x00000000422db000..0x00000000422dbeb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aaba26d04e0> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher9" prio=1 tid=0x00002aabb4e18bb0 nid=0x53a9 runnable
[0x00000000416cf000..0x00000000416cfcb0]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aab90bf8db8> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"fetcher2" prio=1 tid=0x00002aabb4919c40 nid=0x53a2 runnable
[0x0000000040fc8000..0x0000000040fc8c30]
         at
java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:99)
         at
java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:393)
         at java.lang.StringBuffer.append(StringBuffer.java:225)
         - locked <0x00002aaba26d8428> (a java.lang.StringBuffer)
         at org.apache.xerces.dom.CharacterDataImpl.appendData(Unknown
Source)
         at
org.cyberneko.html.parsers.DOMFragmentParser.characters(DOMFragmentParser.java:463)
         at
org.cyberneko.html.filters.DefaultFilter.characters(DefaultFilter.java:195)
         at
org.cyberneko.html.HTMLTagBalancer.characters(HTMLTagBalancer.java:821)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scanCharacters(HTMLScanner.java:1972)
         at
org.cyberneko.html.HTMLScanner$ContentScanner.scan(HTMLScanner.java:1775)
         at
org.cyberneko.html.HTMLScanner.scanDocument(HTMLScanner.java:789)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:478)
         at
org.cyberneko.html.HTMLConfiguration.parse(HTMLConfiguration.java:431)
         at
org.cyberneko.html.parsers.DOMFragmentParser.parse(DOMFragmentParser.java:164)
         at
org.apache.nutch.parse.html.HtmlParser.parseNeko(HtmlParser.java:249)
         at
org.apache.nutch.parse.html.HtmlParser.parse(HtmlParser.java:213)
         at
org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:156)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.handleFetch(Fetcher.java:254)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:148)

"Low Memory Detector" daemon prio=1 tid=0x00002aabb3d03bd0 nid=0x539e
runnable [0x0000000000000000..0x0000000000000000]

"CompilerThread1" daemon prio=1 tid=0x00002aabb3d02110 nid=0x539d
waiting on condition [0x0000000000000000..0x0000000040ac27d0]

"CompilerThread0" daemon prio=1 tid=0x00002aabb3d00d10 nid=0x539c
waiting on condition [0x0000000000000000..0x00000000409c1450]

"AdapterThread" daemon prio=1 tid=0x00002aabb0cd8d30 nid=0x539b waiting
on condition [0x0000000000000000..0x0000000000000000]

"Signal Dispatcher" daemon prio=1 tid=0x00002aabb0cd7990 nid=0x539a
runnable [0x0000000000000000..0x0000000000000000]

"Finalizer" daemon prio=1 tid=0x00002aabb0cc5480 nid=0x5399 in
Object.wait() [0x00000000406bf000..0x00000000406bfcb0]
         at java.lang.Object.wait(Native Method)
         - waiting on <0x00002aaaf4b9c3b0> (a
java.lang.ref.ReferenceQueue$Lock)
         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:116)
         - locked <0x00002aaaf4b9c3b0> (a java.lang.ref.ReferenceQueue$Lock)
         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:132)
         at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159)

"Reference Handler" daemon prio=1 tid=0x00002aabb0cc22e0 nid=0x5398 in
Object.wait() [0x00000000405be000..0x00000000405bed30]
         at java.lang.Object.wait(Native Method)
         - waiting on <0x00002aaaf4bc37d0> (a java.lang.ref.Reference$Lock)
         at java.lang.Object.wait(Object.java:474)
         at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116)
         - locked <0x00002aaaf4bc37d0> (a java.lang.ref.Reference$Lock)

"main" prio=1 tid=0x0000000040115bc0 nid=0x5390 waiting on condition
[0x00007fffffa88000..0x00007fffffa88830]
         at java.lang.Thread.sleep(Native Method)
         at org.apache.nutch.fetcher.Fetcher.run(Fetcher.java:351)
         at org.apache.nutch.fetcher.Fetcher.main(Fetcher.java:488)

"VM Thread" prio=1 tid=0x00000000401bab30 nid=0x5397 runnable

"GC task thread#0 (ParallelGC)" prio=1 tid=0x00000000401a85c0 nid=0x5395
runnable

"GC task thread#1 (ParallelGC)" prio=1 tid=0x00000000401a8c60 nid=0x5396
runnable

"VM Periodic Task Thread" prio=1 tid=0x00002aabb3d05980 nid=0x539f
waiting on condition
Reply | Threaded
Open this post in threaded view
|

Re: fetcher hangs and thead lifetime

Paul van Brouwershaven
Hello Jon,

I have the same problem here: the fetcher gets stuck after running for a
few hours.

How do you get a good crawler if you must repair the database and start
again every time?

Jon Shoberg wrote:
>
>   Is there a way to set the lifetime of a fetching thread?  As in if it
> can not complete the entire fetching process in X minutes to gracefully
> give up?
>
>   Anyone else experience the fetcher hanging for a long period of time
> (hour+)?  I'm using 100 threads, 30 per host.  I'm guessing that I have
> one host which it is "stuck" on.

Reply | Threaded
Open this post in threaded view
|

Map Reduce

Gal Nitzan
Hi,

Can someone please refer me to some info on map reduce, or describe it a
little?

Thanks,

Gal
Reply | Threaded
Open this post in threaded view
|

Re: Map Reduce

Jack.Tang
Hi Gal

You can get the original paper from Google Labs
              http://labs.google.com/papers/mapreduce.html
and some presentations on the Nutch wiki
              http://wiki.apache.org/nutch/Presentations

Hope these resources help.

Regards
/Jack

On 9/27/05, Gal Nitzan <[hidden email]> wrote:
> Hi,
>
> Can someone please refer me to some info on map reduce, or describe it a
> little?
>
> Thanks,
>
> Gal
>


--
Keep Discovering ... ...
http://www.jroller.com/page/jmars
Reply | Threaded
Open this post in threaded view
|

RE: Map Reduce

Goldschmidt, Dave
In reply to this post by Gal Nitzan
Hello, MapReduce is described on Nutch's Wiki:

  http://wiki.apache.org/nutch/Presentations

Specifically:

 
http://wiki.apache.org/nutch-data/attachments/Presentations/attachments/mapred.pdf

Hope this helps,
DaveG


-----Original Message-----
From: Gal Nitzan [mailto:[hidden email]]
Sent: Tuesday, September 27, 2005 7:09 AM
To: [hidden email]
Subject: Map Reduce

Hi,

Can someone please refer me to some info on map reduce, or describe it a
little?

Thanks,

Gal
Reply | Threaded
Open this post in threaded view
|

Re: fetcher hangs and thead lifetime

Jon Shoberg
In reply to this post by Paul van Brouwershaven
 > Jon Shoberg wrote:
 >>
 >>   Is there a way to set the lifetime of a fetching thread?  As in if
 >> it can not complete the entire fetching process in X minutes to
 >> gracefully give up?
 >>
 >>   Anyone else experience the fetcher hanging for a long period of time
 >> (hour+)?  I'm using 100 threads, 30 per host.  I'm guessing that I
 >> have one host which it is "stuck" on.

 > Paul van Brouwershaven wrote:
> Helle Jon,
>
> I have the same problem here, the fetcher get stuck aftyher running a
> few hours.
>
> How do you get a good crawler if you everytime must repair the database
> and start again?
>

I run on stable hardware, so I run everything within a screen process,
which allows me to interactively watch what's going on.  I'm in the
testing phases of a nutch implementation so I pay close attention to it.

My request to experienced nutch users / developers:

The wiki has good info.  It would be helpful to hear about people's
small, medium, and large implementations.  What configurations are used?
What tweaks to the conf files? What are the performance bottlenecks?
What are the common implementation problems and how do you fix them?
How have you allowed for dynamic URLs (question marks)?

I'd be willing to aggregate input to wiki entries.

For myself, I'm running a crawling script inside a SCREEN process. This
allows me to SSH in, see what's going on at the console, and gracefully
exit the session.  If I don't like a crawling session I'll CTRL-C it and
let the script keep going.

The perl script generates a segment with -numFetchers and starts calling
the fetcher via a system call.

-j

Reply | Threaded
Open this post in threaded view
|

Re: fetcher hangs and thead lifetime

Jeff Pettenski
In reply to this post by Jon Shoberg
Check
http://www.mail-archive.com/nutch-developers@.../msg04491.html

There is a problem with the PDFBox in nutch 0.7. It may hang.

I D/L the 0.7.2 version from http://www.pdfbox.org/, replaced the jar file
in the plug-in directory and updated the plugin.xml to refer to the new jar
file. Seems to be running better. Not done running my test yet.

-j.p.
Reply | Threaded
Open this post in threaded view
|

Re: fetcher hangs and thead lifetime

Jon Shoberg
Jeff Pettenski wrote:

> Check
> http://www.mail-archive.com/nutch-developers@.../msg04491.html
>
> There is a problem with the PDFBox in nutch 0.7. It may hang.
>
> I D/L the 0.7.2 version from http://www.pdfbox.org/, replaced the jar file
> in the plug-in directory and updated the plugin.xml to refer to the new jar
> file. Seems to be running better. Not done running my test yet.
>
> -j.p.
>

Not quite ... I'm running the pdfbox patched version.  For the past five
minutes I've been looking at this ... when my script should be moving to
the next fetch.  I'm guessing that "fetcher17" is caught on something.

--

Full thread dump Java HotSpot(TM) 64-Bit Server VM (1.5.0_04-b05 mixed
mode):

"MultiThreadedHttpConnectionManager cleanup" daemon prio=1
tid=0x00002aabb3dfaf20 nid=0x3e99 in Object.wait()
[0x000000004722a000..0x000000004722adb0]
         at java.lang.Object.wait(Native Method)
         - waiting on <0x00002aaaf51f1c50> (a
java.lang.ref.ReferenceQueue$Lock)
         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:116)
         - locked <0x00002aaaf51f1c50> (a java.lang.ref.ReferenceQueue$Lock)
         at
org.apache.commons.httpclient.MultiThreadedHttpConnectionManager$ReferenceQueueThread.run(MultiThreadedHttpConnectionManager.java:1100)

"fetcher17" prio=1 tid=0x00002aabb544fd70 nid=0x3e46 runnable
[0x0000000041ed7000..0x0000000041ed7e30]
         at java.net.SocketInputStream.socketRead0(Native Method)
         at java.net.SocketInputStream.read(SocketInputStream.java:129)
         at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
         at java.io.BufferedInputStream.read1(BufferedInputStream.java:256)
         at java.io.BufferedInputStream.read(BufferedInputStream.java:313)
         - locked <0x00002aaaf4dcc428> (a java.io.BufferedInputStream)
         at
org.apache.commons.httpclient.ContentLengthInputStream.read(ContentLengthInputStream.java:169)
         at
org.apache.commons.httpclient.ContentLengthInputStream.read(ContentLengthInputStream.java:183)
         at
org.apache.commons.httpclient.ChunkedInputStream.exhaustInputStream(ChunkedInputStream.java:368)
         at
org.apache.commons.httpclient.ContentLengthInputStream.close(ContentLengthInputStream.java:117)
         at java.io.FilterInputStream.close(FilterInputStream.java:159)
         at
org.apache.commons.httpclient.AutoCloseInputStream.notifyWatcher(AutoCloseInputStream.java:176)
         at
org.apache.commons.httpclient.AutoCloseInputStream.close(AutoCloseInputStream.java:140)
         at
org.apache.nutch.protocol.httpclient.HttpResponse.<init>(HttpResponse.java:125)
         at
org.apache.nutch.protocol.httpclient.Http.getProtocolOutput(Http.java:204)
         at
org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:135)

"Low Memory Detector" daemon prio=1 tid=0x00002aabb3d04ec0 nid=0x3e33
runnable [0x0000000000000000..0x0000000000000000]

"CompilerThread1" daemon prio=1 tid=0x00002aabb3d03400 nid=0x3e32
waiting on condition [0x0000000000000000..0x0000000040ac2510]

"CompilerThread0" daemon prio=1 tid=0x00002aabb3d02000 nid=0x3e31
waiting on condition [0x0000000000000000..0x00000000409c16d0]

"AdapterThread" daemon prio=1 tid=0x00002aabb3d00b90 nid=0x3e30 waiting
on condition [0x0000000000000000..0x0000000000000000]

"Signal Dispatcher" daemon prio=1 tid=0x00002aabb0cd7990 nid=0x3e2f
waiting on condition [0x0000000000000000..0x0000000000000000]

"Finalizer" daemon prio=1 tid=0x00002aabb0cc5480 nid=0x3e2e in
Object.wait() [0x00000000406bf000..0x00000000406bfe30]
         at java.lang.Object.wait(Native Method)
         - waiting on <0x00002aaaf4c44c28> (a
java.lang.ref.ReferenceQueue$Lock)
         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:116)
         - locked <0x00002aaaf4c44c28> (a java.lang.ref.ReferenceQueue$Lock)
         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:132)
         at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159)

"Reference Handler" daemon prio=1 tid=0x00002aabb0cc22e0 nid=0x3e2d in
Object.wait() [0x00000000405be000..0x00000000405bebb0]
         at java.lang.Object.wait(Native Method)
         - waiting on <0x00002aaaf4c47640> (a java.lang.ref.Reference$Lock)
         at java.lang.Object.wait(Object.java:474)
         at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116)
         - locked <0x00002aaaf4c47640> (a java.lang.ref.Reference$Lock)

"main" prio=1 tid=0x0000000040115bc0 nid=0x3e25 waiting on condition
[0x00007fffffcfb000..0x00007fffffcfbc80]
         at java.lang.Thread.sleep(Native Method)
         at org.apache.nutch.fetcher.Fetcher.run(Fetcher.java:351)
         at org.apache.nutch.fetcher.Fetcher.main(Fetcher.java:488)

"VM Thread" prio=1 tid=0x00000000401bab30 nid=0x3e2c runnable

"GC task thread#0 (ParallelGC)" prio=1 tid=0x00000000401a85c0 nid=0x3e2a
runnable

"GC task thread#1 (ParallelGC)" prio=1 tid=0x00000000401a8c60 nid=0x3e2b
runnable

"VM Periodic Task Thread" prio=1 tid=0x00002aabb3d06c70 nid=0x3e34
waiting on condition