爬虫技术之 htmlunit 使用入门
<!-- Maven coordinates for the HtmlUnit headless-browser library. -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.31</version>
</dependency>
//获取页面 WebClient webClient=new WebClient(); //是否开启js渲染 webClient.getOptions().setJavaScriptEnabled(true); HtmlPage page=null; try { page=webClient.getPage("https://mp.csdn.net/"); //等待页面渲染完成 Thread.sleep(3000); //控制台打印出页面 System.out.println(page.asXml()); } catch (Exception e) { e.printStackTrace(); }
3.2 一些设置
//是否开启css渲染 webClient.getOptions().setCssEnabled(false); //是否开启js渲染 webClient.getOptions().setJavaScriptEnabled(true); //是否允许所有人链接(解决https证书不信任问题) webClient.getOptions().setUseInsecureSSL(true); //js失败是否抛出异常 webClient.getOptions().setThrowExceptionOnScriptError(false); //是否启用重定向 webClient.getOptions().setRedirectEnabled(true);
3.3 执行页面js
//执行页面js,并获得结果,获取页面中变量_hmt的值 ScriptResult t=page.executeJavaScript("_hmt "); System.out.println(t.getJavaScriptResult().toString());
3.4 操作 DOM 树,并触发相关事件
//获取元素 类似js语法的操作方式 DomElement domElement= page.getElementById("feedlist_id"); try { //触发单击事件,获得新的页面 HtmlPage page1= domElement.click(); } catch (IOException e) { e.printStackTrace(); }
//创建httpclient的客户端 CookieStore cookieStore = new BasicCookieStore(); CloseableHttpClient httpClient = HttpClients.custom() .setDefaultCookieStore(cookieStore) .build(); //获取htmlunit cookie; Set<Cookie> htmlUnitCookies= webClient.getCookieManager().getCookies(); //将htmlunit cookie 转换成htmlclient cookie for(Cookie cookie:htmlUnitCookies){ cookieStore.addCookie(new BasicClientCookie(cookie.getName(),cookie.getValue())); } //获取htmlclient cookie List<org.apache.http.cookie.Cookie> httpClientCookies= cookieStore.getCookies(); //cookie 转换 for(org.apache.http.cookie.Cookie cookie:httpClientCookies){ webClient.getCookieManager().addCookie(new Cookie(cookie.getDomain(),cookie.getName(),cookie.getValue())); }
引用原文:https://blog.csdn.net/qq_34661726/article/details/80641474