`
xpopi
  • 浏览: 61910 次
  • 性别: Icon_minigender_1
  • 来自: 武汉
社区版块
存档分类
最新评论

httpclient 抓取页面数据导入到Excel

 
阅读更多

最近老有人要我抓取页面的数据,大部分人是想拿到客户资料来开发新的客户。但对不了解 coding 的人来说,只能一个一个地复制、粘贴,非常浪费时间,所以做了个简单的 demo,用到 HttpClient、jsoup、POI。

 

 


public class GetHouseData {

public final static String BASE_URL="http://example.com";

public static void main(String[] args) {

//第一页到第三页

getHoustInfoLink(3);

}

/**

 * 

 * @param n 第一页到第n页

 */

public static void getHoustInfoLink(int n) {


DefaultHttpClient httpclient = new DefaultHttpClient();

Workbook wb = new HSSFWorkbook();

try {

HttpRequestRetryHandler myRetryHandler = new HttpRequestRetryHandler() {

public boolean retryRequest(IOException exception, int executionCount,

HttpContext context) {

System.out.println("尝试连接次数:-------:" + executionCount);

if (executionCount >= 5) {

// 如果超过最大重试次数,那么就不要继续了

return false;

}

if (exception instanceof NoHttpResponseException) {

// 如果服务器丢掉了连接,那么就重试

return true;

}

if (exception instanceof SSLHandshakeException) {

// 不要重试SSL握手异常

return false;

}


HttpRequest request = (HttpRequest) context

.getAttribute(ExecutionContext.HTTP_REQUEST);

boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);

if (idempotent) {

// 如果请求被认为是幂等的,那么就重试

return true;

}

return false;

}

};

for(int pageIndex =1;pageIndex<=n;pageIndex++){

httpclient.setHttpRequestRetryHandler(myRetryHandler);

List<NameValuePair> formparams = new 

ArrayList<NameValuePair>(); 

formparams.add(new BasicNameValuePair("__EVENTARGUMENT", String.valueOf(pageIndex))); 

formparams.add(new BasicNameValuePair("__EVENTTARGET", "AspNetPager1")); 

formparams.add(new BasicNameValuePair("ddlistOrder", "1")); 

UrlEncodedFormEntity urlEntity = new UrlEncodedFormEntity(formparams, "UTF-8");

HttpPost httppost = new HttpPost("BASE_URL"); 

httppost.setEntity(urlEntity); 

ResponseHandler<String> responseHandler = new BasicResponseHandler();

String responseBody = httpclient.execute(httppost, responseHandler);

Document doc = Jsoup.parse(responseBody);

Elements elements = doc.select(".modou .k2 .leb_mod");

JSONArray arr = new JSONArray();

for(int i =0 ;i<elements.size();i++){

JSONObject obj = new JSONObject();

Element element = elements.get(i);

//楼盘图片地址

String imgURL= element.select(".mod_1 img").attr("src");

//楼盘名称

String houstName = element.select(".mod_2 .zuti .a1").text();

//楼盘价格

String price = element.select(".mod_2 .zuti .b1").text();

//更新时间

String updatedTime = element.select(".mod_2 .zuti .c1").text();

//销售电话

String sellPhone = element.select(".mod_2 .dizi .a1 span").text();

element.select(".mod_2 .dizi .b1 span").remove();

String houstDeveloper = element.select(".mod_2 .dizi .b1").eq(0).text();

String address = element.select(".mod_2 .dizi .b1").eq(1).text();

obj.put("imgURL", imgURL);

obj.put("houstName", houstName);

obj.put("price", price);

obj.put("updatedTime", updatedTime);

obj.put("sellPhone", sellPhone);

obj.put("houstDeveloper", houstDeveloper);

obj.put("address", address);

arr.add(obj);

}

 

   //Workbook wb = new XSSFWorkbook();

   Sheet sheet = wb.createSheet("HouseList-page-"+pageIndex);

// Create a new font and alter it.

   Font font = wb.createFont();

   font.setFontName("Times New Roman");

   CellStyle style = wb.createCellStyle();

   style.setFont(font);

   int i =0;

   for (Object object  : arr) {

    object =arr.get(i);

    JSONObject js  =(JSONObject) object;

   

     Row row = sheet.createRow((short)i);

             Cell cell0 = row.createCell(0);

             Cell cell1 = row.createCell(1);

             Cell cell2 = row.createCell(2);

             Cell cell3 = row.createCell(3);

             Cell cell4 = row.createCell(4);

             Cell cell5 = row.createCell(5);

             Cell cell6 = row.createCell(6);

             

             cell0.setCellStyle(style);

             cell0.setCellValue(js.get("imgURL").toString());

             cell1.setCellStyle(style);

             cell1.setCellValue(js.get("houstName").toString());

             cell2.setCellValue(js.get("price").toString());

             cell3.setCellValue(js.get("updatedTime").toString());

             cell4.setCellValue(js.get("sellPhone").toString());

             cell5.setCellValue(js.get("houstDeveloper").toString());

             cell6.setCellValue(js.get("address").toString());

             

   i++;

   }

   

   System.out.println("--导入中...........页面:第"+pageIndex+"页");

   //睡眠一秒

   Thread.sleep(1000);

   

}

   

   

   FileOutputStream fileOut = new FileOutputStream("house-"+System.currentTimeMillis()+".xls");

   wb.write(fileOut);

   fileOut.close();

// System.out.println(elements.html());

// Struts2Utils.getResponse().setCharacterEncoding("UTF-8");

} catch (Exception e) {

e.printStackTrace();

} finally {

httpclient.getConnectionManager().shutdown();

}


}

}


 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics