The customizable parts are:
Request building (the default is a GET request with a Chrome user-agent); implement the RequestSet interface to customize how requests are made.
Storage (the default writes to the html folder on the F: drive); implement the SaveUtil interface to customize how pages are saved.
Which resources to save (the default is the whole html page).
Url filtering (by default every url qualifies); implement the ResourseChooser interface to choose which urls to follow and which pages count as resources to save.
The parts already implemented are:
Downloading html pages, done with HttpClient.
Parsing html pages, done with jsoup.
The HtmlDownloader class downloads the html page for a given url.
package DownloadPackage;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/*
 * Downloads the html page for a given url.
 */
public class HtmlDownloader {

    RequestSet requestset = null;

    public HtmlDownloader(RequestSet requestset) {
        this.requestset = requestset;
    }

    public String downloadhtml(String url) {
        String html = null;
        // reader that consumes the html from the response entity
        BufferedReader reader = null;
        // create a client
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpResponse response = null;
        try {
            response = httpclient.execute(requestset.getMethod(url));
            HttpEntity entity = response.getEntity();
            reader = new BufferedReader(new InputStreamReader(entity.getContent()));
            StringBuilder sb = new StringBuilder();
            String line = null;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            html = sb.toString();
            System.out.println("Fetched one html page");
        } catch (IOException e) {
            System.out.println(url + " connection failed");
        } finally {
            // close the client even when the reader was never opened
            try {
                if (reader != null) {
                    reader.close();
                }
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return html;
    }
}
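For reference, a minimal sketch of using HtmlDownloader on its own. The DownloadDemo class and the anonymous RequestSet are hypothetical stand-ins, not part of the framework; the article's own default, getRequest, appears later inside the Spider class.

package DownloadPackage;

import org.apache.http.client.methods.HttpGet;

// Hypothetical demo class: drive HtmlDownloader directly with a bare GET.
public class DownloadDemo {
    public static void main(String[] args) {
        // bare GET with no headers or timeouts; the framework's fuller
        // default request builder is defined inside the Spider class below
        HtmlDownloader downloader = new HtmlDownloader(new RequestSet() {
            public HttpGet getMethod(String url) {
                return new HttpGet(url);
            }
        });
        String html = downloader.downloadhtml("http://www.bilibili.net");
        System.out.println(html == null ? "download failed" : "got " + html.length() + " chars");
    }
}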
The UrlGet class extracts all the url links from an html page.
package DownloadPackage;

import java.util.LinkedList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class UrlGet {

    public LinkedList<String> geturls(String html) {
        LinkedList<String> urls = new LinkedList<String>();
        Document doc = Jsoup.parse(html);
        Elements links = doc.getElementsByTag("a");
        for (Element link : links) {
            String url = link.attr("href");
            urls.add(url);
        }
        return urls;
    }
}
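A quick sketch of what geturls returns, using a hypothetical demo class on a hand-written html snippet:

package DownloadPackage;

import java.util.LinkedList;

// Hypothetical demo class: UrlGet applied to a small html string.
public class UrlGetDemo {
    public static void main(String[] args) {
        String html = "<a href=\"/video\">video</a><a href=\"http://example.com\">out</a>";
        LinkedList<String> urls = new UrlGet().geturls(html);
        // prints "/video" and "http://example.com"; relative links come
        // back as-is and are fixed up later by the chooser's process()
        for (String url : urls) {
            System.out.println(url);
        }
    }
}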
The resource-selection interface requires three methods: isNeed, which decides whether a url is wanted; isResourse, which decides whether a url's page is a resource page we want to save; and process, which rewrites a url when it is wanted but in the wrong format (for example, a relative link).
package ChoosePackage;

public interface ResourseChooser {
    public Boolean isNeed(String url);
    public Boolean isResourse(String url);
    public String process(String url);
}
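One possible custom chooser, as a hedged example (SameHostChooser is hypothetical, not part of the article's framework): stay on a single host and save only pages whose url ends in .html.

package ChoosePackage;

// Hypothetical example of a custom ResourseChooser.
public class SameHostChooser implements ResourseChooser {

    private final String host; // e.g. "http://www.bilibili.net" (assumed seed host)

    public SameHostChooser(String host) {
        this.host = host;
    }

    public Boolean isNeed(String url) {
        // follow relative links and absolute links on the same host only
        return url.startsWith("/") || url.startsWith(host);
    }

    public Boolean isResourse(String url) {
        // save only plain html pages
        return url.endsWith(".html");
    }

    public String process(String url) {
        // prefix relative links with the host
        return url.startsWith("/") ? host + url : url;
    }
}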
The RequestSet interface is used to customize the request; implement getMethod to supply the request object.
package DownloadPackage;

import org.apache.http.client.methods.HttpGet;

/*
 * An interface for supplying the request.
 * Implement getMethod to return the HttpGet used for each url.
 */
public interface RequestSet {
    public HttpGet getMethod(String url);
}

The SaveUtil interface is used to customize storage; implementations must provide a save method.

package SaveUtil;

/*
 * Storage utility interface; a save method must be implemented.
 */
public interface SaveUtil {
    public void save(String url, String html);
}
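As a sketch of a custom SaveUtil (SingleFileSaveUtil and its path are hypothetical, not from the article): append every page to one file instead of writing one file per url.

package SaveUtil;

import java.io.FileWriter;
import java.io.IOException;

// Hypothetical example of a custom SaveUtil.
public class SingleFileSaveUtil implements SaveUtil {

    private final String path; // e.g. "f://html/all.txt" — assumed location

    public SingleFileSaveUtil(String path) {
        this.path = path;
    }

    public void save(String url, String html) {
        // try-with-resources closes the writer even when the write fails
        try (FileWriter writer = new FileWriter(path, true)) {
            writer.write(url + "\n" + html + "\n\n");
        } catch (IOException e) {
            System.out.println("write failed for " + url);
        }
    }
}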
The Spider class has five constructors enabling various combinations of custom behavior, and it contains default implementations of the customization interfaces above.
package Spider;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;

import ChoosePackage.ResourseChooser;
import DownloadPackage.HtmlDownloader;
import DownloadPackage.RequestSet;
import DownloadPackage.UrlGet;
import SaveUtil.SaveUtil;

/*
 * The crawler class.
 */
public class Spider {

    public static void main(String[] args) {
        new Spider("http://www.bilibili.net").spiderstart();
    }

    // seed url
    String seed = null;
    // storage utility; supply your own implementation to change where pages go
    private SaveUtil saveutil = null;
    // html downloader
    private HtmlDownloader downloader = null;
    // url extractor
    private UrlGet urldownloader = null;
    // resource-selection utility
    private ResourseChooser resoursechooser = null;
    // pages not yet downloaded
    LinkedList<String> unvisited = new LinkedList<String>();
    // pages already downloaded
    HashSet<String> visited = new HashSet<String>();

    // constructor with custom storage, request and resource-selection strategies
    public Spider(SaveUtil saveutil, RequestSet request, ResourseChooser resoursechooser, String seed) {
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(request);
        this.urldownloader = new UrlGet();
        this.resoursechooser = resoursechooser;
        this.seed = seed;
        unvisited.add(seed);
    }

    // constructor with custom storage and resource-selection strategies
    public Spider(SaveUtil saveutil, ResourseChooser resoursechooser, String seed) {
        this.resoursechooser = resoursechooser;
        this.downloader = new HtmlDownloader(new getRequest());
        this.saveutil = saveutil;
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // constructor with custom storage and request strategies
    public Spider(SaveUtil saveutil, RequestSet requestset, String seed) {
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(requestset);
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // constructor with a custom storage strategy
    public Spider(SaveUtil saveutil, String seed) {
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(new getRequest());
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // default constructor
    public Spider(String seed) {
        this.saveutil = new MySaveUtil();
        this.downloader = new HtmlDownloader(new getRequest());
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // the crawl loop
    private void spiderstart() {
        String html = null;
        while (!unvisited.isEmpty()) {
            String url = unvisited.poll();
            System.out.println("Fetching " + url);
            if (resoursechooser.isNeed(url)) {
                try {
                    html = downloader.downloadhtml(url);
                } catch (RuntimeException e) {
                    System.out.println(url + " connection failed");
                    continue;
                }
                visited.add(url);
                LinkedList<String> urls = new LinkedList<String>();
                try {
                    urls = urldownloader.geturls(html);
                } catch (RuntimeException e) {
                    System.out.println(url + " returned an empty html page");
                    continue;
                }
                Iterator<String> it = urls.iterator();
                while (it.hasNext()) {
                    String newurl = it.next();
                    if (resoursechooser.isNeed(newurl) && !visited.contains(newurl) && !unvisited.contains(newurl)) {
                        newurl = resoursechooser.process(newurl);
                        unvisited.add(newurl);
                        System.out.println(newurl + " added to the queue");
                    }
                }
                System.out.println("Collected all urls on " + url);
                if (resoursechooser.isResourse(url)) {
                    saveutil.save(url, html);
                }
            }
        }
    }

    // default resource chooser
    private class MyResourseChooser implements ResourseChooser {

        @Override
        public Boolean isNeed(String url) {
            if (!url.startsWith("/") && !url.startsWith("http")) {
                return false;
            }
            return true;
        }

        @Override
        public Boolean isResourse(String url) {
            return true;
        }

        @Override
        public String process(String url) {
            // relative urls are prefixed with the seed
            if (!url.startsWith("http")) {
                url = seed + url;
            }
            return url;
        }
    }

    public class getRequest implements RequestSet {

        public HttpGet getMethod(String url) {
            // build a get request
            HttpGet getmethod = new HttpGet(url);
            // HttpHost proxy = new HttpHost("124.88.67.81", 80); no proxy ip is set here
            // configure the timeouts
            RequestConfig responseconfig = RequestConfig.custom()
                    .setConnectionRequestTimeout(10000)
                    .setConnectTimeout(10000)
                    .setSocketTimeout(10000).build();
            // set the request header, mainly the user-agent
            getmethod.addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
            // apply the request config
            getmethod.setConfig(responseconfig);
            return getmethod;
        }
    }

    // default storage utility
    public class MySaveUtil implements SaveUtil {

        @Override
        public void save(String url, String html) {
            String filename = getfilename(url);
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(filename));
                writer.write(html);
                writer.flush();
                System.out.println("File written successfully");
            } catch (IOException e) {
                System.out.println("File write failed");
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }
                } catch (IOException e) {
                    System.out.println("Failed to close the stream");
                }
            }
        }

        private String getfilename(String url) {
            String fileparentpath = "f://html";
            File file = new File(fileparentpath);
            if (!file.exists()) {
                file.mkdir();
            }
            int last = url.lastIndexOf(".");
            int first = url.indexOf(".");
            url = url.substring(first, last);
            url = url.replaceAll("\\.", "");
            url = url.replaceAll("/", "");
            return fileparentpath + "/" + url + ".txt";
        }
    }
}
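Putting it together, a sketch of driving Spider with custom pieces (SameHostChooser and SingleFileSaveUtil are the hypothetical examples from earlier, not part of the framework; note that spiderstart is private in the listing above and is invoked from main, so calling it externally would require making it public):

package Spider;

import ChoosePackage.SameHostChooser;
import SaveUtil.SingleFileSaveUtil;

// Hypothetical demo class wiring custom strategies into Spider.
public class SpiderDemo {
    public static void main(String[] args) {
        String seed = "http://www.bilibili.net";
        Spider spider = new Spider(
                new SingleFileSaveUtil("f://html/all.txt"),
                new SameHostChooser(seed),
                seed);
        // spider.spiderstart(); // uncomment once spiderstart() is made public
    }
}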
Summary
That is all of this article on a simple Java crawler framework; I hope it helps. If you have any questions, feel free to leave a comment and we will reply promptly. Thanks for your support of the site!
Original link: http://blog.csdn.net/qq_35488769/article/details/70591405