
Fixing Failures When Crawling Websites Through a Crawler Proxy

Author: 数据的海洋 | Published 2021-07-09 15:43

It is common knowledge that a crawler has to use dynamic (rotating) proxies to avoid being throttled for visiting a site too frequently. In practice, however, even with dynamic proxies the crawler can still run into 403, 503 or 429 anti-scraping errors. Why? Based on past experience, this usually comes down to the following causes:

1. Rotate the User-Agent dynamically

When a crawler fetches a site, every normal HTTP request it sends should carry a properly tuned UA (User-Agent), because the UA identifies the browser. If a request has no User-Agent at all, or the crawler even advertises itself as a scraper in that header, the target site is very likely to reject it.
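
A minimal sketch of what this can look like with the same HttpClient types used in the demo below; the class name, the UA strings and the random pick are illustrative assumptions, not part of the original code:

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.http.client.methods.HttpRequestBase;

public class UserAgentRotator {
    // Example desktop UA strings; a real pool should be larger and kept up to date
    private static final List<String> USER_AGENTS = Arrays.asList(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15"
    );

    // Pick a random User-Agent and attach it to the outgoing request
    public static void applyRandomUserAgent(HttpRequestBase request) {
        String ua = USER_AGENTS.get(ThreadLocalRandom.current().nextInt(USER_AGENTS.size()));
        request.setHeader("User-Agent", ua);
    }
}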

2. Control the request rate of each individual proxy IP

Even though the crawler uses dynamic proxies, poorly implemented multithreading control can let a single proxy IP fire off a large burst of requests in a short time, and that IP then gets flagged by the target site for excessive access.
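
One way to do this, assuming the program knows which proxy IP each request will exit through, is to reserve a send slot per IP before dispatching; the class below and its 1000 ms interval are illustrative assumptions:

import java.util.HashMap;
import java.util.Map;

public class PerIpThrottle {
    // Minimum gap between two requests through the same proxy IP (example value)
    private static final long MIN_INTERVAL_MS = 1000;
    // Next allowed send time per proxy IP
    private static final Map<String, Long> nextSlot = new HashMap<>();

    // Block the calling thread until the given proxy IP may send its next request
    public static void acquire(String proxyIp) throws InterruptedException {
        long wait;
        synchronized (nextSlot) {
            long now = System.currentTimeMillis();
            long slot = Math.max(now, nextSlot.getOrDefault(proxyIp, 0L));
            nextSlot.put(proxyIp, slot + MIN_INTERVAL_MS);  // reserve the slot after this one
            wait = slot - now;
        }
        if (wait > 0) {
            Thread.sleep(wait);  // wait until this IP's reserved slot arrives
        }
    }
}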

3. Manage proxy IP lifetime and health

While dynamic proxy IPs are in use, they must be health-checked continuously. As soon as a proxy IP shows high latency or very low bandwidth, it should be dropped proactively, so it does not cause timeouts later during crawling.
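
A rough liveness probe along these lines tests each candidate IP against a known endpoint and discards it when the connection fails or is too slow. The class, the test URL and the use of an unauthenticated java.net.Proxy are illustrative assumptions (a real pool check would also have to handle proxy credentials):

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;

public class ProxyHealthCheck {
    // Return true if the proxy answers a test request within maxLatencyMs
    public static boolean isUsable(String ip, int port, int maxLatencyMs) {
        long start = System.currentTimeMillis();
        try {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
            HttpURLConnection conn =
                (HttpURLConnection) new URL("https://httpbin.org/ip").openConnection(proxy);
            conn.setConnectTimeout(maxLatencyMs);
            conn.setReadTimeout(maxLatencyMs);
            int code = conn.getResponseCode();   // actually performs the request
            conn.disconnect();
            long elapsed = System.currentTimeMillis() - start;
            return code == 200 && elapsed <= maxLatencyMs;
        } catch (IOException e) {
            return false;   // connect failure or timeout: discard this proxy IP
        }
    }
}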

If all of the above sounds like too much work, consider an auto-forwarding crawler proxy (爬虫代理加强版). This kind of product automatically assigns a different proxy IP to forward each HTTP request and manages the IP pool across multiple threads on its own; the vendor claims a connection success rate above 99% with latency under 300 ms, so crawling can start quickly. Below is a product demo that can be copied and used directly; just configure the proxy parameters (proxyHost, proxyPort, proxyUser, proxyPass) and the target site (targetUrl), then run it:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

public class Demo {

    // Proxy server (product site: www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31000;

    // Proxy credentials
    final static String proxyUser = "username";
    final static String proxyPass = "password";

    private static PoolingHttpClientConnectionManager cm = null;
    private static HttpHost proxy = null;
    private static CredentialsProvider credsProvider = null;
    private static RequestConfig reqConfig = null;

    static {
        // Connection pool with socket factories for both plain HTTP and TLS routes
        ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", plainsf)
            .register("https", sslsf)
            .build();
        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(20);
        cm.setDefaultMaxPerRoute(5);

        // Proxy host and authentication
        proxy = new HttpHost(proxyHost, proxyPort, "http");
        credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxyUser, proxyPass));

        // Route every request through the proxy with 5-second timeouts
        reqConfig = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setExpectContinueEnabled(false)
            .setProxy(new HttpHost(proxyHost, proxyPort))
            .build();
    }

    public static void doRequest(HttpRequestBase httpReq) {
        CloseableHttpResponse httpResp = null;
        try {
            setHeaders(httpReq);
            httpReq.setConfig(reqConfig);
            CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultCredentialsProvider(credsProvider)
                .build();

            // Pre-populate the auth cache so proxy credentials are sent preemptively
            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());
            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            httpResp = httpClient.execute(httpReq, localContext);
            int statusCode = httpResp.getStatusLine().getStatusCode();
            System.out.println(statusCode);

            // Print the response body line by line
            BufferedReader rd = new BufferedReader(new InputStreamReader(httpResp.getEntity().getContent()));
            String line = "";
            while ((line = rd.readLine()) != null) {
                System.out.println(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (httpResp != null) {
                    httpResp.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Set request headers.
     *
     * @param httpReq the outgoing request
     */
    private static void setHeaders(HttpRequestBase httpReq) {
        // Optionally set a Proxy-Tunnel header to pin a tunnel
        // Random random = new Random();
        // int tunnel = random.nextInt(10000);
        // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

        httpReq.setHeader("Accept-Encoding", null);
    }

    public static void doGetRequest() {
        // Target page to fetch
        String targetUrl = "https://httpbin.org/ip";
        try {
            HttpGet httpGet = new HttpGet(targetUrl);
            doRequest(httpGet);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        doGetRequest();
    }
}
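
To compile and run the demo, Apache HttpClient 4.5.x needs to be on the classpath (Maven coordinate org.apache.httpcomponents:httpclient, which also pulls in httpcore); the code above is written against that 4.x API.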
