`

java 抓取网页图片

阅读更多
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds image references in a web page and downloads them into a local directory.
 *
 * <p>References are located by applying regular expressions to the raw page source,
 * so only quoted {@code src} / {@code background} attribute values that end in
 * {@code .jpg}, {@code .png} or {@code .gif} are recognised.
 *
 * @author 方小洲
 *
 * 2013-08-05 08:46:54
 */
public class ImageParse {

	/** Size in bytes of the buffer used when copying a download to disk. */
	private static final int COPY_BUFFER_SIZE = 8192;

	/**
	 * Matches a site-relative image reference; group 1 is the path, including a
	 * leading slash when the page wrote one.
	 */
	private static final Pattern RELATIVE_IMAGE = Pattern.compile(
			"(?:src|background)=['\"](/?(?:[\\w-]+/)*[\\w-]+\\.(?:jpg|png|gif))['\"]",
			Pattern.CASE_INSENSITIVE);

	/** Matches an absolute http/https image reference; group 1 is the full URL. */
	private static final Pattern ABSOLUTE_IMAGE = Pattern.compile(
			"(?:src|background)=['\"](https?://(?:[\\w-]+\\.)+[\\w-]+(?::[0-9]+)?(?:/[\\w-]+)*/[\\w-]+\\.(?:jpg|png|gif))['\"]",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Collects the URLs of the images referenced by a page.
	 *
	 * <p>The returned list holds every relative reference first (in document order,
	 * resolved against {@code url}), followed by every absolute reference (in
	 * document order). Duplicates are not removed.
	 *
	 * @param url address of the page to scan; must be a well-formed URL
	 * @return absolute image URLs found in the page, possibly empty
	 * @throws Exception if {@code url} is malformed or the page cannot be read
	 */
	public List<String> getImagesPath(String url) throws Exception {

		List<String> imagePaths = new ArrayList<String>();

		String htmlCode = getHtmlCode(url);
		URL pageUrl = new URL(url);

		// Relative references: resolve against the page URL rather than concatenating,
		// so "/a.jpg" is taken from the site root and "a.jpg" from the page's directory.
		Matcher matcher = RELATIVE_IMAGE.matcher(htmlCode);
		while (matcher.find()) {
			imagePaths.add(new URL(pageUrl, matcher.group(1)).toString());
		}

		// Absolute references are already complete and are used verbatim.
		matcher = ABSOLUTE_IMAGE.matcher(htmlCode);
		while (matcher.find()) {
			imagePaths.add(matcher.group(1));
		}

		return imagePaths;
	}

	/**
	 * Fetches the source of a page as a single string.
	 *
	 * <p>Lines are concatenated without their line terminators, and the bytes are
	 * decoded with the platform default charset.
	 *
	 * @param url address of the page to fetch
	 * @return the page source with line terminators removed
	 * @throws Exception if {@code url} is malformed or reading the page fails
	 */
	public String getHtmlCode(String url) throws Exception {
		StringBuilder html = new StringBuilder();
		URL httpUrl = new URL(url);
		BufferedReader reader = new BufferedReader(new InputStreamReader(httpUrl.openStream()));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				html.append(line);
			}
		} finally {
			// Release the connection even when reading fails part-way through.
			reader.close();
		}
		return html.toString();
	}


	/**
	 * Downloads every image referenced by a page.
	 *
	 * @param targetUrl  address of the page whose images are wanted
	 * @param outputPath directory the images are written into
	 * @throws Exception if the page cannot be read or any single download fails;
	 *                   images after the failing one are not attempted
	 */
	public void downLoadImages(String targetUrl , String outputPath) throws Exception{
		List<String> imagePaths = getImagesPath(targetUrl);
		for (String imagePath : imagePaths) {
			generatorImageBathByUrl(imagePath , outputPath);
		}
	}

	/**
	 * Downloads one image into {@code outputPath}.
	 *
	 * <p>The file is named after the current time in milliseconds plus the
	 * extension of the image URL; a numeric suffix is appended when that name is
	 * already taken so that quick successive downloads do not overwrite each other.
	 * The output directory is created when it does not exist.
	 *
	 * @param imagePath  absolute URL of the image
	 * @param outputPath directory the image is written into
	 * @throws Exception if the directory cannot be created, {@code imagePath} is
	 *                   malformed, or the transfer fails
	 */
	public void generatorImageBathByUrl(String imagePath , String outputPath) throws Exception{

		File directory = new File(outputPath);
		if (!directory.isDirectory() && !directory.mkdirs()) {
			throw new java.io.IOException("Cannot create output directory: " + outputPath);
		}
		File target = uniqueTarget(directory, extensionOf(imagePath));

		URL imageUrl = new URL(imagePath);
		BufferedInputStream in = new BufferedInputStream(imageUrl.openStream());
		try {
			FileOutputStream out = new FileOutputStream(target);
			try {
				// Copy in blocks; writing single bytes to an unbuffered file stream
				// costs one system call per byte.
				byte[] buffer = new byte[COPY_BUFFER_SIZE];
				int read;
				while ((read = in.read(buffer)) != -1) {
					out.write(buffer, 0, read);
				}
			} finally {
				out.close();
			}
		} finally {
			in.close();
		}
	}

	/**
	 * Returns the extension (with its dot) of the last path segment of
	 * {@code imagePath}, or an empty string when that segment has no dot.
	 */
	private static String extensionOf(String imagePath) {
		int dot = imagePath.lastIndexOf('.');
		// A dot that precedes the last slash belongs to the host or a directory.
		return dot > imagePath.lastIndexOf('/') ? imagePath.substring(dot) : "";
	}

	/**
	 * Picks a timestamp-based file in {@code directory} that does not exist yet.
	 * Not safe against other threads or processes creating the same name concurrently.
	 */
	private static File uniqueTarget(File directory, String extension) {
		long stamp = System.currentTimeMillis();
		File candidate = new File(directory, stamp + extension);
		for (int suffix = 1; candidate.exists(); suffix++) {
			candidate = new File(directory, stamp + "_" + suffix + extension);
		}
		return candidate;
	}

	/**
	 * Demo entry point: downloads the images of a fixed page into a fixed directory.
	 *
	 * @param args ignored
	 * @throws Exception if the page or any image cannot be downloaded
	 */
	public static void main(String[] args) throws Exception {
		ImageParse parse = new ImageParse();
		parse.downLoadImages("http://www.fjboda.cn","d:\\image");
	}

}

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics