[jira] [Commented] (NUTCH-2648) Make configurable whether TLS/SSL certificates are checked by protocol plugins

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[jira] [Commented] (NUTCH-2648) Make configurable whether TLS/SSL certificates are checked by protocol plugins

JIRA jira@apache.org

    [ https://issues.apache.org/jira/browse/NUTCH-2648?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16643378#comment-16643378 ]

ASF GitHub Bot commented on NUTCH-2648:
---------------------------------------

sebastian-nagel closed pull request #388:  NUTCH-2648 Make configurable whether TLS/SSL certificates are checked by protocol plugins
URL: https://github.com/apache/nutch/pull/388
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 9f57af26e..065ed86fe 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -249,6 +249,18 @@
   </description>
 </property>
 
+<property>
+  <name>http.tls.certificates.check</name>
+  <value>false</value>
+  <description>
+    Whether to check the TLS/SSL server certificates for validity.
+    If true invalid (e.g., self-signed or expired) certificates are
+    rejected and the https connection is failed.  If false insecure
+    TLS/SSL connections are allowed.  Note that this property is
+    currently not supported by all http/https protocol plugins.
+  </description>
+</property>
+
 <property>
   <name>http.proxy.host</name>
   <value></value>
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 42f479312..a5c0a90f1 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -157,6 +157,9 @@
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
 
+  /** Whether to check TLS/SSL certificates */
+  protected boolean tlsCheckCertificate = false;
+
   /** Which TLS/SSL protocols to support */
   protected Set<String> tlsPreferredProtocols;
 
@@ -206,6 +209,8 @@ public void setConf(Configuration conf) {
     // backward-compatible default setting
     this.useHttp11 = conf.getBoolean("http.useHttp11", true);
     this.useHttp2 = conf.getBoolean("http.useHttp2", false);
+    this.tlsCheckCertificate = conf.getBoolean("http.tls.certificates.check",
+        false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
     this.storeIPAddress = conf.getBoolean("store.ip.address", false);
     this.storeHttpRequest = conf.getBoolean("store.http.request", false);
@@ -496,6 +501,10 @@ public boolean getUseHttp11() {
     return useHttp11;
   }
 
+  public boolean isTlsCheckCertificates() {
+    return tlsCheckCertificate;
+  }
+
   public Set<String> getTlsPreferredCipherSuites() {
     return tlsPreferredCipherSuites;
   }
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 95ae35248..b4d3fbcb9 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -355,10 +355,17 @@ public Metadata getHeaders() {
    * -------------------------
    */
 
-  private SSLSocket getSSLSocket(Socket socket, String sockHost, int sockPort) throws Exception {
-    SSLContext sslContext = SSLContext.getInstance("TLS");
-    sslContext.init(null, new TrustManager[]{new DummyX509TrustManager(null)}, null);
-    SSLSocketFactory factory = sslContext.getSocketFactory();
+  private SSLSocket getSSLSocket(Socket socket, String sockHost, int sockPort)
+      throws Exception {
+    SSLSocketFactory factory;
+    if (http.isTlsCheckCertificates()) {
+      factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+    } else {
+      SSLContext sslContext = SSLContext.getInstance("TLS");
+      sslContext.init(null,
+          new TrustManager[] { new DummyX509TrustManager(null) }, null);
+      factory = sslContext.getSocketFactory();
+    }
     
     SSLSocket sslsocket = (SSLSocket) factory
       .createSocket(socket, sockHost, sockPort, true);
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index c185f9bdc..2cd29d3e9 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -47,7 +47,7 @@
 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 import org.apache.commons.httpclient.protocol.Protocol;
 import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
-
+import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
 import org.apache.commons.lang.StringUtils;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
@@ -184,8 +184,12 @@ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
   private void configureClient() {
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
-    // ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
-    ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
+    ProtocolSocketFactory factory;
+    if (tlsCheckCertificate) {
+      factory = new SSLProtocolSocketFactory();
+    } else {
+      factory = new DummySSLProtocolSocketFactory();
+    }
     Protocol https = new Protocol("https", factory, 443);
     Protocol.registerProtocol("https", https);
 
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index f2558b9dc..d5018df87 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -16,8 +16,8 @@
  */
 package org.apache.nutch.protocol.okhttp;
 
-import java.lang.invoke.MethodHandles;
 import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.net.Proxy;
@@ -25,6 +25,7 @@
 import java.net.SocketAddress;
 import java.net.URI;
 import java.net.URL;
+import java.security.cert.CertificateException;
 import java.util.ArrayList;
 import java.util.Base64;
 import java.util.LinkedList;
@@ -32,16 +33,21 @@
 import java.util.Locale;
 import java.util.concurrent.TimeUnit;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import javax.net.ssl.HostnameVerifier;
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.SSLSession;
+import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
 
 import org.apache.hadoop.conf.Configuration;
-
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import okhttp3.Authenticator;
 import okhttp3.Connection;
@@ -59,6 +65,41 @@
 
   private OkHttpClient client;
 
+  private static final TrustManager[] trustAllCerts = new TrustManager[] {
+      new X509TrustManager() {
+        @Override
+        public void checkClientTrusted(
+            java.security.cert.X509Certificate[] chain, String authType)
+            throws CertificateException {
+        }
+
+        @Override
+        public void checkServerTrusted(
+            java.security.cert.X509Certificate[] chain, String authType)
+            throws CertificateException {
+        }
+
+        @Override
+        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
+          return new java.security.cert.X509Certificate[] {};
+        }
+      } };
+
+  private static final SSLContext trustAllSslContext;
+
+  static {
+    try {
+      trustAllSslContext = SSLContext.getInstance("SSL");
+      trustAllSslContext.init(null, trustAllCerts,
+          new java.security.SecureRandom());
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private static final SSLSocketFactory trustAllSslSocketFactory = trustAllSslContext
+      .getSocketFactory();
+
   public OkHttp() {
     super(LOG);
   }
@@ -81,6 +122,17 @@ public void setConf(Configuration conf) {
         .writeTimeout(timeout, TimeUnit.MILLISECONDS)
         .readTimeout(timeout, TimeUnit.MILLISECONDS);
 
+    if (!tlsCheckCertificate) {
+      builder.sslSocketFactory(trustAllSslSocketFactory,
+          (X509TrustManager) trustAllCerts[0]);
+      builder.hostnameVerifier(new HostnameVerifier() {
+        @Override
+        public boolean verify(String hostname, SSLSession session) {
+          return true;
+        }
+      });
+    }
+
     if (!accept.isEmpty()) {
       getCustomRequestHeaders().add(new String[] { "Accept", accept });
     }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[hidden email]


> Make configurable whether TLS/SSL certificates are checked by protocol plugins
> ------------------------------------------------------------------------------
>
>                 Key: NUTCH-2648
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2648
>             Project: Nutch
>          Issue Type: Improvement
>          Components: protocol
>    Affects Versions: 1.15
>            Reporter: Sebastian Nagel
>            Priority: Minor
>             Fix For: 1.16
>
>
> (see discussion in NUTCH-2647)
> It should be possible to enable/disable TLS/SSL certificate validation centrally for all http/https protocol plugins by a single configuration property.
> Some use cases (e.g., crawling a site to detect insecure pages) may require that TLS/SSL certificates are checked. Also, a broader, unrestricted web crawl may skip sites with invalid certificates, as this can be an indicator of the quality of a site.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)