从网页源文件中得到链接

  • 来源: 编程中国 作者: 若水   2008-03-27/11:56
  • import java.net.*;
    import java.io.*;
    import java.util.*;

    /**
     * Downloads a web page to a local file and extracts attribute values
     * (for example link targets) from its markup.
     *
     * Intended call order: setURL, setFileName, downFile, readFile, scanLabel.
     */
    class ScanPage{
        // Text of the page loaded by readFile(); null until a page has been read.
        // Kept per instance so two scanners cannot overwrite each other's page.
        private String strPage;
        private String strUrl;
        private String fileName;

        /**
         * Sets the address of the page to download.
         *
         * @param strUrl absolute URL of the page
         */
        public void setURL(String strUrl){
            this.strUrl=strUrl;
        }

        /**
         * Derives the local file name from the URL set by setURL.
         *
         * The name is the last path segment with any query string or fragment
         * removed and without a leading slash, so the file lands in the working
         * directory. When the URL has no last segment (it ends in a slash),
         * "index.html" is used.
         *
         * @throws NullPointerException if no URL has been set
         */
        public void setFileName(){
            String path=strUrl;
            int cut=path.indexOf('?');
            if (cut!=-1)
                path=path.substring(0,cut);
            cut=path.indexOf('#');
            if (cut!=-1)
                path=path.substring(0,cut);
            // lastIndexOf returns -1 when there is no slash, so +1 keeps the whole string.
            String name=path.substring(path.lastIndexOf('/')+1);
            fileName=name.length()==0 ? "index.html" : name;
        }

        /**
         * Downloads the page at the configured URL into the configured file.
         *
         * @throws IOException if the URL cannot be opened or the file cannot be written
         */
        public void downFile()throws IOException{
            URL url =new URL(strUrl);
            InputStream is =url.openStream();
            try{
                OutputStream os =new FileOutputStream(fileName);
                try{
                    byte[] buffer =new byte[4096];
                    int len;
                    while((len =is.read(buffer))!=-1)
                        os.write(buffer,0,len);
                }finally{
                    os.close();
                }
            }finally{
                // Close the network stream even when opening or writing the file fails.
                is.close();
            }
        }

        /**
         * Loads the downloaded file into memory, one line at a time, joining the
         * lines with "\n". The file is decoded with the platform default charset
         * (FileReader behaviour).
         *
         * @throws IOException if the file cannot be opened or read
         */
        public void readFile() throws IOException {
            StringBuffer sb = new StringBuffer();
            BufferedReader in =new BufferedReader(new FileReader(fileName));
            try{
                String s;
                while((s = in.readLine()) != null) {
                    sb.append(s);
                    sb.append("\n");
                }
            }finally{
                in.close();
            }
            strPage=sb.toString();
        }

        /**
         * Placeholder: title extraction is not implemented.
         *
         * @return always the empty string
         */
        public String getTitle(){
            return "";
        }

        /**
         * Scans the loaded page, starting at its body tag, for every occurrence
         * of strLabel and collects the attribute value that follows it.
         *
         * Matching of the label and of the type suffix ignores case; the
         * collected values keep the case they have in the page. A value may be
         * wrapped in double or single quotes, or be unquoted, in which case it
         * ends at the first whitespace character or at the end of the tag.
         * Empty values are skipped. Nothing is collected when no page has been
         * loaded, when the page has no body tag, or when strLabel is empty.
         *
         * @param al       list that receives the matching values (as String)
         * @param strLabel text that precedes the value, e.g. {@code "<a href="}
         * @param strType  required suffix such as ".html", or "*" to accept every value
         */
        @SuppressWarnings({"rawtypes", "unchecked"})
        public void scanLabel(ArrayList al,String strLabel,String strType){
            if (strPage==null || strLabel.length()==0)
                return;
            int len=strPage.length();
            int idx=indexOfIgnoreCase(strPage,"<body",0);
            while(idx!=-1){
                idx=indexOfIgnoreCase(strPage,strLabel,idx);
                if (idx==-1)
                    break;
                int start=idx+strLabel.length();
                char first= start<len ? strPage.charAt(start) : '\0';
                int close=-1;
                if (first=='"' || first=='\''){
                    // Only accept a closing quote inside the current tag; otherwise an
                    // unterminated value would swallow markup up to some later quote.
                    int tagEnd=strPage.indexOf('>',start);
                    if (tagEnd==-1)
                        tagEnd=len;
                    close=strPage.indexOf(first,start+1);
                    if (close>=tagEnd)
                        close=-1;
                }
                String value;
                int end;
                if (close!=-1){
                    value=strPage.substring(start+1,close);
                    end=close+1;
                }else{
                    end=start;
                    while(end<len && strPage.charAt(end)!='>'
                            && !Character.isWhitespace(strPage.charAt(end)))
                        ++end;
                    value=stripQuotes(strPage.substring(start,end));
                }
                // end >= start > idx, so the scan always moves forward.
                idx=end;
                if (value.length()>0 && matchesType(value,strType))
                    al.add(value);
            }
        }

        // Returns the first index >= from at which needle occurs in text, ignoring case, or -1.
        private static int indexOfIgnoreCase(String text,String needle,int from){
            int last=text.length()-needle.length();
            for(int i=Math.max(from,0);i<=last;++i){
                if (text.regionMatches(true,i,needle,0,needle.length()))
                    return i;
            }
            return -1;
        }

        // Removes one leading and one trailing quote character (double or single), if present.
        private static String stripQuotes(String s){
            if (s.length()>0 && (s.charAt(0)=='"' || s.charAt(0)=='\''))
                s=s.substring(1);
            if (s.length()>0){
                char tail=s.charAt(s.length()-1);
                if (tail=='"' || tail=='\'')
                    s=s.substring(0,s.length()-1);
            }
            return s;
        }

        // True when strType is "*" or value ends with strType, ignoring case.
        private static boolean matchesType(String value,String strType){
            if ("*".equals(strType))
                return true;
            int offset=value.length()-strType.length();
            return offset>=0 && value.regionMatches(true,offset,strType,0,strType.length());
        }
    }

    class ScanApp{
        public static void main(String[] args){
            ArrayList al=new ArrayList();
            ScanPage sp=new ScanPage();
            sp.setURL(args[0]);
            sp.setFileName();
            try{
                sp.downFile();
                sp.readFile();
            }catch(IOException ie){System.out.println("文件操作出错");};
            sp.scanLabel(al,"<a href=",".html");
            for(int i=0;i<al.size();i++){
                System.out.println(al.get(i));
            }
        }
    }

    E:\javawork>java ScanApp http://www.17kyk.com/Html/Book/16/2431/list.html
    list.html
    429400.html
    429401.html
    439789.html
    429403.html
    429404.html
    429405.html
    429406.html
    429407.html
    429408.html
    429409.html
    429411.html
    429412.html
    说明:
    scanLabel针对<a href和<img src这2个标签进行扫描,并把得到的链接存储在ArrayList中.
    得到所有链接:


    scanLabel(al,"<a href=","*");
    scanLabel(al,"<img src=","*");
    得到特定类型的链接:
    scanLabel(al,"<a href=",".html");
    scanLabel(al,"<a href=",".asp");
    scanLabel(al,"<img src=",".gif");

     


    评论 {{userinfo.comments}}

    {{money}}

    {{question.question}}

    A {{question.A}}
    B {{question.B}}
    C {{question.C}}
    D {{question.D}}
    提交

    驱动号 更多