`
bdk82924
  • 浏览: 551442 次
  • 性别: Icon_minigender_1
  • 来自: 南京
社区版块
存档分类
最新评论

HttpClient 网络抓取

 
阅读更多

利用 HttpClient 进行网络抓取时,有如下几点需要注意:

1、抓取普通页面,注意中文问题

2、抓取需要登录的页面,有验证码 和无验证码区分

3、防抓取页面,如Javaeye

4、设置代理服务器

5、解析抓取后的内容,解析可以 用Jsoup,文档:http://www.open-open.com/jsoup/

 

 1、抓取普通页面的代码:

 

  public final static void main(String[] args) throws Exception
    {
        try
        {

            String url = "http://bdk82924.iteye.com/admin/blogs/1329405";

            HttpClient client = new HttpClient();
            PostMethod post = new PostMethod(url);
            // 设置http头
            post.setRequestHeader(new Header("Content-type", "text/xml; charset=\"utf-8\""));
            post.setRequestHeader(new Header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"));
            // 设置代理服务器
            client.getHostConfiguration().setProxy("proxy.com", 80);

            int result = client.executeMethod(post);
            if (result != 200)
            {
                // 失败的响应代码
                System.out.println("获取失败,rspCode:" + result);
            }

            System.out.println(new String(post.getResponseBody(), "UTF-8"));
        } catch (Exception e)
        {
            e.printStackTrace();
        }
    }

 

2、抓取需要登录的页面

 

 public class Client
{
 

    private static Cookie[] cookies;

    /**
     * 的到Http请求结果
     *
     * @param url
     *            请求地址
     * @param parms
     *            请求参数
     * @return
     */
    public static void doLogin(String url, Map parms)
    {
        byte[] body = new byte[0];
        String str = null;
        // 构造HttpClient的实例
        HttpClient client = new HttpClient();
        // 创建GET方法的实例
        PostMethod postMethod = new PostMethod(url);
        // 填入各个表单域的值
        NameValuePair[] data = new NameValuePair[parms.keySet().size()];
        Iterator it = parms.entrySet().iterator();
        int i = 0;
        while (it.hasNext())
        {
            Map.Entry entry = (Map.Entry) it.next();
            Object key = entry.getKey();
            Object value = entry.getValue();
            data[i] = new NameValuePair(key.toString(), value.toString());
            i++;
        }
        // 将表单的值放入postMethod中
        postMethod.setRequestBody(data);
        try
        {
            // 执行postMethod
            int statusCode = client.executeMethod(postMethod);
            // HttpClient对于要求接受后继服务的请求,象POST和PUT等不能自动处理转发
            // 301或者302
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
            {
                // 从头中取出转向的地址
                Header locationHeader = postMethod.getResponseHeader("location");
                String location = null;
                if (locationHeader != null)
                {
                    location = locationHeader.getValue();
                } else
                {
                    System.err.println("Location field value is null.");
                }
            }
            cookies = client.getState().getCookies();
            client.getState().addCookies(cookies);

        } catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * 的到Http请求结果
     *
     * @param url
     *            请求地址
     * @param parms
     *            请求参数
     * @return
     */
    public static byte[] getURL(String url)
    {
        byte[] body = new byte[0];
        // 构造HttpClient的实例
        HttpClient client = new HttpClient();
        // 创建GET方法的实例
        GetMethod get = new GetMethod(url);

        get.setRequestHeader("Cookie", cookies.toString());
        try
        {
            client.executeMethod(get);
            body = get.getResponseBody();
        } catch (Exception e)
        {
            e.printStackTrace();
        }

        return body;
    }

    public static void main(String[] args) throws UnsupportedEncodingException
    {
   String loginURL = "http://XX/login.do";
   String testURL = "http://XX/a.do";
 
        Map map = new HashMap();
        map.put("username", "XX");
        map.put("passwd", "XX");
        doLogin(loginURL, map);

        String s = new String(getURL(testURL), "utf-8");
        System.out.println(s);

    }

}

 

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics