# 0827--算法练习题

import java.io.*;
import java.util.*;

public class URL1{
	public HashMap<String,Integer> parseURLs(String path){
		HashMap<String,Integer> map=new HashMap<String,Integer>();
		try{
			BufferedReader br=new BufferedReader(new FileReader(path));
			String str,url;
			int count,len;
			
			while((str=br.readLine())!=null){
				int index;
				if((index=str.indexOf('?'))!=-1){
					str=str.substring(0,index);
				}
				
				String[] tmp=str.split("/");
			  len=tmp.length;
				if(len==3){
					url="null";
				}else{
					url=tmp[len-1];
				}
				
				if(map.containsKey(url)){
					count=map.get(url).intValue();
				}else{
					count=0;
				}
				map.put(url,count+1);
			}
			
		}catch(Exception ex){
		}
					
		return map;
	}
				
	public static void main(String[] args){
		URL1 url1=new URL1();
		HashMap<String,Integer> map=url1.parseURLs("url1.txt");
	
	 /*	Iterator it=map.entrySet().iterator();          //注意hashMap的遍历方法
		while(it.hasNext()){
			Map.Entry<String,Integer> me=(Map.Entry)it.next();
			System.out.println(me.getKey()+","+me.getValue());
		}
	
		System.out.println("--------------");
	 */
	 
		System.out.println(map);
	}

}

awk -F / '{for(i=0;i<NF;i++){n=index($i,"?"); if(n!=0){$i=substr($i,0,n); count[$i]++; break;}} if(i==NF){if($(i-1)=="") {count["null"]++;}  else{count[$(i-1)]++;}}} END{for(file in count){print file,":",count[file]}}' file

测试数据：

http://www.test.com/abc/de/fg.php?id=1&url=http://www.test.com/index.html
http://www.ceshi.com/hi.jsp
ftp://ftp.ceshi.com/hi.jsp
http://www.hello.com/cw/hi.jsp?k=8
http://www.hi.com/jk/l.html?id=1&s=a.html
http://www.rs.com/n.op/q/rs?id=1
http://www.abc.com/

2. 编写一个C语言函数，要求输入一个url，输出该url是首页、目录页或者其他url
如下形式叫做首页：
militia.info/
www.apcnc.com.cn/
http://www.cyjzs.comwww.greena888.com/
www.800cool.net/
http://hgh-products.my-age.net/
如下形式叫做目录页：
thursdaythree.net/greenhouses--gas-global-green-house-warming/
http://www.mw.net.tw/user/tgk5ar1r/profile/
http://www.szeasy.com/food/yszt/chunjie/
www.fuckingjapanese.com/Reality/

请注意：
a） url有可能带http头也有可能不带
b）动态url（即含有"？"的url）的一律不算目录页，如：
www.buddhismcity.net/utility/mailit.php?l=/activity/details/3135/
www.buddhismcity.net/utility/mailit.php?l=/activity/details/2449/

另：如果你会linux，请用linux下的grep命令实现第2题的功能（附加5分）。

public class URL2{
	public void parseURL(String url){
		int index,len,firLen;
		if(url.indexOf('?')!=-1){
			System.out.println(url+": 其他url");
			return;
		}else{
			String[] tmp=url.split("/");
			len=tmp.length;
			
			if((index=url.indexOf("http://"))!=-1){
				firLen=3;
			}else{
				firLen=1;
			}
		
			if(len==firLen){
				System.out.println(url+": 主页");
				return;
			}else if(len>firLen){
				if((tmp[len-1].indexOf('.'))==-1){
					System.out.println(url+": 目录页");
					return;
				}else{
					System.out.println(url+": 其他url");
			   	return;
			  }
			}
		}
	}
	
	public static void main(String[] args){
		URL2 url2=new URL2();
		url2.parseURL("http://www.cyjzs.comwww.greena888.com/");
		url2.parseURL("www.apcnc.com.cn/");
		url2.parseURL("thursdaythree.net/greenhouses--gas-global-green-house-warming/");
		url2.parseURL("www.buddhismcity.net/utility/mailit.php?l=/activity/details/3135/");
		url2.parseURL("www.buddhismcity.net/utility/mailit.php");
	}
	
}

awk -F / '{if((index($0,"?"))!=0){print "其他url";}else{if((index($0,"http://")!=0){len1=3;}else{len1=1;}if(NF==len1) {print "首页";}else{if((index($(NF-1),".")!=0){print "其他url"}else{print "目录页"}}}' file

3. 对给定的上亿条无序的url，请按照domain、site以及path分别排序，并请指出排序过程中可能会遇到的哪些问题？如何提高效率？

例如：http://www.baidu.com/path/about.html，domain、site以及path的定义分别如下：

Domain:baidu.com

Site:www.baidu.com

Path: www.baidu.com/path

分享到：

？？？0828--存储空间管理器+n选m问题 | ！！！0826--图

2010-08-25 14:12
浏览 778
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论