用 HtmlUnit 和 HttpClient 抓取施华洛世奇网站的图片和动画（二）

suiyuan0808

浏览: 152454 次
性别:
来自: 杭州

最近访客更多访客>>

morelily

roketabc

kingtsing

aaron198

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java

thread

// Load one first-level page and sort its links: pages with a "categories"
    // element contribute second-level URLs, pages without one contribute
    // product (third-level) URLs taken from the "products" element.
    try
    {
        mainPage = webClient.getPage(url);
    }
    catch (Exception e)
    {
        log.error(e.getMessage(), e);
    }
    if (mainPage != null)
    {
        HtmlElement paginationContainer = mainPage.getElementById("paginationContainer");
        if (paginationContainer != null)
        {
            // The listing is paginated: reload it through the "/all-1" URL
            // (presumably the single-page view of the listing - confirm).
            url = url + "/all-1";
            try
            {
                mainPage = webClient.getPage(url);
            }
            catch (Exception e)
            {
                // On failure the page loaded above is still used.
                log.error(e.getMessage(), e);
            }
        }

        HtmlElement categories = mainPage.getElementById("categories");
        if (categories == null)
        {
            // No category list: treat the page as a product listing.
            HtmlElement products = mainPage.getElementById("products");
            if (products != null)
            {
                List<HtmlElement> productDivList = products.getElementsByTagName("div");
                if (productDivList != null && productDivList.size() > 0)
                {
                    for (HtmlElement proDiv : productDivList)
                    {
                        List<HtmlElement> subList = proDiv.getElementsByTagName("div");
                        if (subList == null)
                        {
                            continue;
                        }
                        for (HtmlElement dt : subList)
                        {
                            String classt = dt.getAttribute("class");
                            if ("productImg".equals(classt))
                            {
                                // The first anchor under the enclosing div is
                                // taken as the link to the product page.
                                List<HtmlElement> subAList = proDiv.getElementsByTagName("a");
                                if (subAList != null && subAList.size() > 0)
                                {
                                    HtmlElement ae = subAList.get(0);
                                    String href = ae.getAttribute("href");
                                    thirdPageUrls.add(href);
                                }
                                // Only the first "productImg" div is considered.
                                break;
                            }
                        }
                    }
                }
            }
        }
        else
        {
            List<HtmlElement> divList = categories.getElementsByTagName("div");
            if (divList != null && divList.size() > 0)
            {
                for (HtmlElement div : divList)
                {
                    List<HtmlElement> tempDivs = div.getElementsByTagName("div");
                    if (tempDivs != null && tempDivs.size() > 0)
                    {
                        for (HtmlElement div1 : tempDivs)
                        {
                            List<HtmlElement> aList = div1.getElementsByTagName("a");
                            // A nested div without any anchor used to make
                            // get(0) throw and abort the crawl; skip it instead.
                            if (aList == null || aList.isEmpty())
                            {
                                continue;
                            }
                            HtmlAnchor a = (HtmlAnchor) aList.get(0);
                            String link = a.getHrefAttribute();
                            secondPageUrls.add(link);
                        }
                    }
                }
            }
        }
    }
   }
  }
  log.error("第二层抓取结束..........");
  log.error("目前抓取到的第二层URL个数为:"+secondPageUrls.size());

  // Running index of the URL being processed; used only in log messages here
  // and reset before the third-level pass.
  int count=0;

  // Visit every second-level URL and collect the product links found in the
  // "productName" divs of its "products" element.
  for (String url : secondPageUrls)
  {
      count++;
      log.error("正在抓取第二层的第"+count+"个URL:"+url);
      HtmlPage mainPage = null;
      try
      {
          mainPage = webClient.getPage(url);
      }
      catch (Exception e)
      {
          log.error(e.getMessage(), e);
      }
      log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果的URL");
      if (mainPage == null)
      {
          continue;
      }

      HtmlElement paginationContainer = mainPage.getElementById("paginationContainer");
      if (paginationContainer != null)
      {
          // Paginated listing: the "/all-1" URL has to be fetched as well.
          // Previously the URL was rewritten but never requested, so only the
          // first page of products was parsed.
          url = url + "/all-1";
          try
          {
              mainPage = webClient.getPage(url);
          }
          catch (Exception e)
          {
              // On failure the page loaded above is still used.
              log.error(e.getMessage(), e);
          }
      }

      HtmlElement products = mainPage.getElementById("products");
      if (products == null)
      {
          continue;
      }
      List<HtmlElement> list = products.getElementsByTagName("div");
      if (list == null || list.size() == 0)
      {
          continue;
      }
      for (HtmlElement h : list)
      {
          String cls = h.getAttribute("class");
          if (cls == null || !cls.equals("productName"))
          {
              continue;
          }
          List<HtmlElement> links = h.getElementsByTagName("a");
          if (links != null && links.size() > 0)
          {
              HtmlAnchor htmlAnchor = (HtmlAnchor) links.get(0);
              String linkStr = htmlAnchor.getHrefAttribute();
              thirdPageUrls.add(linkStr);
              log.error(linkStr);
          }
      }
  }
  log.error("第二层抓取结束..........");
  // The second-level list is not needed any more; drop it.
  secondPageUrls.clear();
  secondPageUrls = null;

  count=0;
  log.error("目前抓取到的第三层URL个数为:"+thirdPageUrls.size());
  // Common prefix of product URLs; the part after it becomes the local directory path.
  String urlPrix="http://www.swarovski-crystallized.com/jewelry/us/";
for (String url : thirdPageUrls)
  {
   count++;
   log.error("正在抓取第三层的第"+count+"个URL:"+url);
   HtmlPage mainPage = null;
   try
   {
    mainPage = webClient.getPage(url);
   } catch (Exception e)
   {
    log.error(e.getMessage(), e);
   }
   if (mainPage != null)
   {
    log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果");
    // Derive the local directory from the part of the URL that lies between
    // the site prefix and the query string.
    int indexC=url.indexOf(urlPrix);
    int indexD=url.indexOf("?");
    // A URL without a query string used to make substring() throw because the
    // end index was -1; in that case the path runs to the end of the URL.
    int pathEnd = indexD < 0 ? url.length() : indexD;
    // NOTE(review): if url does not contain urlPrix (for example a relative
    // href), indexC is -1 and the start index below is wrong - confirm that
    // every third-level URL is absolute and begins with urlPrix.
    String dirStr=url.substring(indexC+urlPrix.length(), pathEnd);
    String regEx = "/";

    // Create the directory that will receive the data scraped for this product.
    dirStr=replece( regEx,"\\\\",dirStr);
    dirStr = "D:\\swaroski\\"+dirStr;
    File file = new File(dirStr);
    if(file.isDirectory())
    {
     // The directory already exists: use a sub-directory named after the
     // running URL index instead.
     dirStr=dirStr+"\\"+count;
     file = new File(dirStr);
    }

    file.mkdirs();

Product product=new Product();
    product.setLocalDir(dirStr);

    product.setPageUrl(url);
    HtmlElement rightCol = mainPage.getElementById("rightCol");
    String title = null;
    String description = null;
    String packingUnit = null;
    if (rightCol != null)
    {
     HtmlElement headlineDiv = rightCol
       .getElementById("headline");
     if (headlineDiv != null)

……（此处省略部分代码）……

log.error("完成:"+url+",分析结果");
     try
     {
      swaroSkiDAO.addProduct(product);
     } catch (Exception e)
     {
      log.error(e.getMessage(), e);
     }
     log.error("完成保存结果");

     // Submit one download task per resource URL, skipping files that are
     // already present in the product directory.
     for(String downloadUrl:resourceUrlList)
     {
      int index6=downloadUrl.lastIndexOf("/");
      String fileName=downloadUrl.substring(index6+1);
      String dirStr2=dirStr+"\\"+fileName;
      // Check the full target path: the worker writes to dirStr2, so testing
      // the bare file name (relative to the working directory) never matched.
      File storeFile = new File(dirStr2);
      if(storeFile.exists())
      {
       continue;
      }
      SaveFileThread runable=new SaveFileThread(dirStr2,downloadUrl,sem);
      pools.submit(runable);
      log.error("开始提交下载文件:"+downloadUrl);
      try
      {
       // Pause between submissions (2.5 s).
       Thread.sleep(2500);
      } catch (InterruptedException e)
      {
       // Keep the interrupt visible to the caller and stop submitting further
       // downloads instead of silently ignoring the request.
       Thread.currentThread().interrupt();
       break;
      }
     }

// Worker task that downloads one image or animation file and saves it to disk.

class SaveFileThread implements Runnable
{
  private String fileName;
  private String downloadUrl;
  private Semaphore sem;
          public SaveFileThread(String fileName,String downloadUrl,Semaphore sem)
          {
           this.fileName=fileName;
           this.downloadUrl=downloadUrl;
           this.sem=sem;
          }
  public void run()
  {
            HttpClient client = new HttpClient();
           GetMethod get = new GetMethod(downloadUrl);
           FileOutputStream output=null;
           try
     {
        client.executeMethod(get);
         File storeFile = new File(fileName);
             output = new FileOutputStream(storeFile);
             output.write(get.getResponseBody());
             output.flush();
     } catch (Exception e)
     {
      log.error(e.getMessage(), e);
     }

1
顶

1
踩

分享到：

MemCached缓存使用试验 | 用HtmlUnit和httpClient抓施华洛世奇网站图 ...

2010-11-24 23:27
浏览 2328
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论