先上代码
BasicSpider.java:
import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import org.apache.commons.httpclient.Cookie;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpException;import org.apache.commons.httpclient.HttpMethod;import org.apache.commons.httpclient.NameValuePair;import org.apache.commons.httpclient.URIException;import org.apache.commons.httpclient.cookie.CookiePolicy;import org.apache.commons.httpclient.methods.GetMethod;import org.apache.commons.httpclient.methods.PostMethod;import org.apache.commons.httpclient.params.HttpMethodParams;import org.apache.commons.httpclient.protocol.Protocol;import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;public class BasicSpider { public HttpClient client = null; protected String lastUrl; private String ENCODING="UTF-8"; public String getENCODING() { return ENCODING; } public void setENCODING(String eNCODING) { ENCODING = eNCODING; } public HttpClient getClient() { return client; } public void setClient(HttpClient client) { this.client = client; } public void setHeaders(HttpMethod method) { method.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;"); method.setRequestHeader("Accept-Language", "zh-cn"); method .setRequestHeader( "User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3"); method.setRequestHeader("Accept-Charset", "UTF-8"); method.setRequestHeader("Keep-Alive", "300"); method.setRequestHeader("Connection", "Keep-Alive"); method.setRequestHeader("Cache-Control", "no-cache"); } public String doPost(String actionUrl, NameValuePair[] params, String referer, String encoding) throws HttpException, IOException { PostMethod method = new PostMethod(actionUrl); setHeaders(method); method.setRequestHeader("Referer", referer); method.setRequestHeader("Content-Type", "application/x-www-form-urlencoded"); method.setRequestBody(params); 
client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); client.getParams().setParameter("http.protocol.content-charset", encoding); client.getParams().setParameter("http.protocol.single-cookie-header", true); // logPostRequest(method); client.executeMethod(method); String responseStr = readInputStream(method.getResponseBodyAsStream(), encoding); method.releaseConnection(); System.out.println("responseStr:"+responseStr); if(responseStr.indexOf("HTTP-EQUIV=/"Refresh/"")!=-1){ String refreshPage =responseStr.substring(responseStr.indexOf("http:"), responseStr.indexOf("/">")); System.out.println("refreshPage:"+refreshPage); return doGet(refreshPage,"",encoding); } String welcomePage =""; if (method.getResponseHeader("Location") != null) { welcomePage = method.getResponseHeader("Location").getValue(); welcomePage = welcomePage.replaceAll("%3a", ":"); welcomePage = welcomePage.replaceAll("%3f", "?"); welcomePage = welcomePage.replaceAll("%26", "&"); if (method.getResponseHeader("Location").getValue().startsWith( "http")) { //System.out.println("welcomePage:"+welcomePage); return doGet(welcomePage,"", encoding); } else { //System.out.println("welcomePage2:"+"http://" + getResponseHost(method) + welcomePage); return doGet("http://" + getResponseHost(method) + welcomePage,"", encoding); } } else { //System.out.println("lastUrl:"+lastUrl); lastUrl = method.getURI().toString(); return responseStr; } } private String getResponseHost(PostMethod method) throws URIException { String url = method.getURI().toString(); return url.split("/")[2]; } protected String getJSRedirectLocation(String content) { String name = "window.location.replace(/""; int index = content.indexOf(name) + name.length(); content = content.substring(index); content = content.substring(0, content.indexOf("/"")); return content; } private String readInputStream(InputStream is) throws IOException { byte[] b = new byte[4096]; StringBuilder builder = new StringBuilder(); int bytesRead = 0; while 
(true) { bytesRead = is.read(b, 0, 4096); if (bytesRead == -1) { return builder.toString(); } builder.append(new String(b, 0, bytesRead, ENCODING)); } } private String readInputStream(InputStream is, String encoding) throws IOException{ StringBuffer temp = new StringBuffer(); BufferedReader buffer = new BufferedReader(new InputStreamReader(is,encoding)); for(String tempstr = ""; (tempstr = buffer.readLine()) != null;) temp = temp.append(tempstr+"/n"); buffer.close(); is.close(); String result = temp.toString().trim(); return result; } public String doGet(String url, String referer, String encoding) throws HttpException, IOException { GetMethod method = new GetMethod(url); setHeaders(method); method.setRequestHeader("Referer", referer); client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); client.getParams().setParameter("http.protocol.content-charset", encoding); client.getParams().setParameter("http.protocol.single-cookie-header", true); method.getParams().setParameter(HttpMethodParams.SO_TIMEOUT,15000); client.getHttpConnectionManager().getParams().setConnectionTimeout(15000); //logGetRequest(method); try{ client.executeMethod(method); String responseStr = readInputStream(method.getResponseBodyAsStream(), encoding); //logGetResponse(method, responseStr); //System.out.println("responseStr:"+responseStr); method.releaseConnection(); //lastUrl = method.getURI().toString(); return responseStr; }catch(Exception e){ return ""; } } }
一般的网站数据url中会有一些数字,表示页数
用get方式获取对应url的源代码,其中i为page number,此处为占位符;StringUtil.getEncodingGB2312()为编码格式,这个要看网站的编码格式是什么,一般GB2312/UTF-8
String res = spider.doGet(String.format(url, i), "", StringUtil.getEncodingGB2312());
获取网站源码之后,就是split、substring各种解析了。。。
下面是post方式,稍微复杂一点:
有些网站需要登录后,才能查看一些信息,所以我们需要模拟登陆
好,我们现在来举一个例子,只是一个例子啊,如果你是这个网站的负责人,不要来揍我,只是举一个例子。。。另外这也不是做广告(突然想起来)
http://www.cofeed.com/Login.asp 这是一个登陆界面
先注册一个,比如用户名,密码都是111111,然后用chrome打开 ,鼠标右击网页内任何一处,选择“审查元素”,
选择network,如下图,现在什么都没有
在登录界面输入用户名、密码之后,点击登录,发现已经有数据了
点击第一个
看到上面的Form Data了吗?表单提交,我们等会儿需要把数据都传过去
Cofeed_Name、cofeed_PWD、remember、act最好都传
接下来是post传参数的代码了 我这里少传了一个好像也没有问题,但是刚开始act没有传就没成功
// Simulated login: POST the same fields the browser's form submits.
BasicSpider spider = new BasicSpider();
spider.setClient(new HttpClient());

// Every field from the login form should be sent along.
NameValuePair[] params = {
        new NameValuePair("Cofeed_Name", "111111"),
        new NameValuePair("cofeed_PWD", "111111"),
        new NameValuePair("act", "LoginOk"),
};

String content = spider.doPost(
        "http://www.cofeed.com/WebUser/UserChckLogin.asp",
        params,
        "",
        StringUtil.getEncodingUTF_8());
上面的content就跟刚才的get一样,是网页的源代码,不过我们需要的数据不是这个登陆界面的
再get一次就好
String res = spider.doGet(String.format(url, i), "", StringUtil.getEncodingUTF_8());
接下来还是解析。。。
数据爬下来之后,可以保存到本地文件中,如txt文件,也可以直接插入到数据库。爬数据需要记录当前的page number,可能因为一些事情,暂时断掉,下次继续爬。另外有些网站可能有防爬机制,试试Thread.sleep(XXX),方法里面的时间可以是一个固定的,比如2秒(2000),也可以是随机的,比如4~6秒。要是不着急,可以定一个符合人类浏览网页的时间,这样就不会被发现,哈哈