Java 获得网页源代码和模拟浏览器请求(个人总结)
Java获取源代码自己知道的几种方式,在这里总结一下。1:GetSourceCode.javapackage kalision;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.HttpURLC
·
Java获取源代码自己知道的几种方式,在这里总结一下。
1:GetSourceCode.java
package kalision;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads a web page over HTTP and prints its source to standard output,
 * one line at a time.
 */
public class GetSourceCode {
    /**
     * Fetches the hard-coded page and echoes every line of the response body
     * to {@code System.out}.
     *
     * <p>The body is decoded with the platform default charset because
     * {@link InputStreamReader} is created without an explicit one; if the
     * page uses a different encoding the output may be garbled.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if the connection cannot be opened or reading fails
     */
    public static void main(String[] args) throws IOException {
        URL myurl = new URL("http://www.baidu.com"); // page whose source is fetched
        HttpURLConnection huc = (HttpURLConnection) myurl.openConnection();
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(huc.getInputStream()));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    System.out.println(line);
                }
            } finally {
                // Close the reader (and the underlying response stream) even when reading fails.
                in.close();
            }
        } finally {
            // Release the connection regardless of how the read ended.
            huc.disconnect();
        }
    }
}
或者
2.test1.java
package kalision;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
/**
 * Downloads a page, saves a copy of its non-empty lines to
 * {@code ../index.html} and prints the collected text to standard output.
 */
public class test1 {
    /**
     * Opens the hard-coded URL, pipes the response through
     * {@link #pipe(InputStream, String)} and prints the returned text.
     * Any exception is reported with a stack trace instead of being rethrown.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        try {
            URL url = new URL("http://train.qunar.com/stationToStation.htm?fromStation=%E6%B5%8E%E5%8D%97&toStation=%E7%83%9F%E5%8F%B0&date=2012-01-08");
            // The page is read through url.openStream(); the separate URLConnection the
            // original created (with setDoOutput(true)) was never used and has been dropped.
            InputStream in = url.openStream();
            String content = pipe(in, "utf-8");
            System.out.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code in} line by line, writes every non-empty line to
     * {@code ../index.html} (encoded as UTF-8) and returns those lines
     * concatenated.
     *
     * <p>Empty lines are skipped, and the returned string contains no line
     * separators: the lines are appended back to back. The file copy has one
     * line per non-empty input line.
     *
     * @param in      stream to read; it is always closed before this method returns
     * @param charset name of the charset used to decode {@code in};
     *                {@code null} or the empty string means {@code "utf-8"}
     * @return all non-empty lines of the input, concatenated without separators
     * @throws IOException if the output file cannot be opened, the charset name
     *                     is not supported, or reading fails
     */
    static String pipe(InputStream in, String charset) throws IOException {
        if (charset == null || "".equals(charset)) {
            charset = "utf-8";
        }
        // Local accumulator: no synchronization needed, so StringBuilder is enough.
        StringBuilder s = new StringBuilder();
        try {
            BufferedReader bReader = new BufferedReader(new InputStreamReader(in, charset));
            PrintWriter pw = new PrintWriter(
                    new OutputStreamWriter(new FileOutputStream("../index.html"), "utf-8"));
            try {
                String rLine;
                while ((rLine = bReader.readLine()) != null) {
                    if (rLine.length() > 0) {
                        s.append(rLine);
                        pw.println(rLine);
                    }
                }
            } finally {
                // close() flushes buffered output, so no per-line flush is required.
                pw.close();
            }
        } finally {
            // Close the source even if the file could not be opened or reading failed.
            in.close();
        }
        return s.toString();
    }
}
如果把得到的源文件保存后打开时出现乱码,那是编码问题,
可以尝试把源文件头部声明的编码改为GBK等。
以上两种方式都可以得到页面的源码。
对于有请求参数的页面如:
test1类中的url,它是一个请求连接,带有参数,以get方式提交的url
返回的源码可能没有我们想要的数据。
据个人了解,这种页面的数据大多数情况下放在另一个页面(请求地址)中,
返回的源文件再通过js动态地从那个页面获取数据并加载到当前页面。
可以用firebug等一些工具来抓到此页面。
来分析解析这些需要的动态数据。
当然,个人感觉这种做法并不推荐,也不容易实现。
对于上面讲到的get方式提交,可以直接在url后边添加参数。下面是以post方式提交数据并请求的示例:
1.Test.java
import java.util.Properties;
/**
 * Small driver that issues POST requests through {@code HtmlPost} and prints
 * the response bytes, decoded as UTF-8, to standard error.
 */
public class Test {
    /**
     * Form fields sent by {@link #testRequestPostForm()}, as name/value pairs
     * in the order they are added to the form.
     */
    private static final String[][] FORM_FIELDS = {
        {"ictN", "5924"},
        {"fdl", ""},
        {"lx", "00"},
        {"nyear3", "2011"},
        {"nyear3_new_value", "true"},
        {"nmonth3", "12"},
        {"nmonth3_new_value", "true"},
        {"nday3", "27"},
        {"nday3_new_value", "false"},
        {"startStation_ticketLeft", "6d4e53e80482a0b7"},
        {"startStation_ticketLeft_new_value", "true"},
        {"arriveStation_ticketLeft", "53174e1300e781a2"},
        {"arriveStation_ticketLeft_new_value", "true"},
        {"trainCode", ""},
        {"trainCode_new_value", "true"},
        {"rFlag", "1"},
        {"name_ckball", "value_ckball"},
        {"tFlagDC", "DC"},
        {"tFlagZ", "Z"},
        {"tFlagT", "T"},
        {"tFlagK", "K"},
        {"tFlagPK", "PK"},
        {"tFlagPKE", "PKE"},
        {"tFlagLK", "LK"},
        {"randCode", "BYHJ"},
    };

    /**
     * Posts the bytes of the string {@code "XML"} to a train-search URL via
     * {@code HtmlPost.requestPost} and prints the response.
     *
     * @throws Exception whatever {@code HtmlPost.requestPost} or the UTF-8 decoding throws
     */
    public static void testRequestPostStringByteArray() throws Exception {
        // Browser-like User-Agent header.
        // NOTE(review): this Properties object is populated but never handed to
        // HtmlPost.requestPost below, so the header is presumably not sent - confirm.
        Properties requestProperties = new Properties();
        requestProperties.put(
                "User-Agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler ; .NET CLR 1.1.4322)");

        byte[] payload = "XML".getBytes();
        byte[] response = HtmlPost.requestPost(
                "http://train.qunar.com/stationToStation.htm?fromStation=%E5%8C%97%E4%BA%AC&toStation=%E4%B8%8A%E6%B5%B7&date=2012-01-01",
                payload);
        printResponse(response);
    }

    /**
     * Submits the fixed set of ticket-query form fields to the 12306 endpoint
     * via {@code HtmlPost.requestPostForm} and prints the response.
     *
     * @throws Exception whatever {@code HtmlPost.requestPostForm} or the UTF-8 decoding throws
     */
    public static void testRequestPostForm() throws Exception {
        Properties formProperties = new Properties();
        for (String[] field : FORM_FIELDS) {
            formProperties.put(field[0], field[1]);
        }

        // Alternative target that was tried with the same form:
        // http://train.qunar.com/stationToStation.htm?fromStation=%E5%8C%97%E4%BA%AC&toStation=%E5%B9%BF%E5%B7%9E&date=2011-12-31
        String endpoint = "http://dynamic.12306.cn/TrainQuery/iframeLeftTicketByStation.jsp";
        byte[] response = HtmlPost.requestPostForm(endpoint, formProperties);
        printResponse(response);
    }

    /** Decodes {@code body} as UTF-8 and writes it to standard error. */
    private static void printResponse(byte[] body) throws Exception {
        String text = new String(body, "utf-8");
        System.err.println(text);
    }

    /**
     * Runs the form-post demo; any exception is reported with a stack trace.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String args[]) {
        try {
            testRequestPostForm();
            // testRequestPostStringByteArray();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
更多推荐


所有评论(0)