thinkphp自動采集怎么實現

thinkphp實現自動采集功能的三種方法：

方法一：QueryList

個人感覺比較好用，是采集詳情頁的不錯選擇，但采集復雜一點的列表時就不太好用。具體使用：

thinkphp自動采集怎么實現

控制器示例：

/**
 * Method 1 demo: scrape the listing page with the QueryList class.
 *
 * Builds a rule set of CSS selectors, hands the page URL and the rules to
 * QueryList, reads the result from its `jsonArr` property and dumps it.
 */
public function index(){
    // Load the scraping class.
    // Manual: http://www.php.cn/php/php-QueryList3-thinkphp.html
    import('Org.QL.QueryList');
    $url = "http://www.zyctd.com/gqqg/";
    // One rule per output field: array(CSS selector, 'text' or 'html').
    $reg = array();
    $reg['title'] = array('.sulist_title','text');
    $reg['shuliang'] = array('.su_li1','html');
    $obj = new QueryList($url,$reg);
    $data = $obj->jsonArr;
    // foreach($data as $v){
    //     echo "<br>".$v['title'].'___'.$v['shuliang']."<br>";
    // }
    // p() is defined elsewhere; presumably a debug-dump helper.
    p($data);
}

相關推薦：《ThinkPHP教程》

方法二：simple_html_dom

這個方法比較適合采集結構簡單、HTML標簽類名比較明確的頁面，效果還不錯。具體使用：

thinkphp自動采集怎么實現

控制器示例：

/**
 * Method 2 demo: scrape the listing page with simple_html_dom.
 *
 * Loads the page, takes the first child of the first `ul` inside
 * `.supply_list_box`, and echoes whatever iterating that result yields.
 */
public function index(){
    // Reference docs: http://microphp.us/plugins/public/microphp_res/simple_html_dom/manual.htm#section_quickstart
    // Download: https://github.com/samacs/simple_html_dom/edit/master/simple_html_dom.php
    // Usage with ThinkPHP: http://www.thinkphp.cn/topic/21635.html
    import("Org.Util.simple_html_dom", '', '.php');
    $html = file_get_html('http://www.zyctd.com/gqqg/');
    // The second argument 0 selects the first matching <ul>.
    $ret = $html->find('.supply_list_box ul',0)->first_child();
    foreach($ret as $v){
        echo $v;
    };
}

方法三：獲取頁面HTML，進行正則匹配采集

舉例一個Demo：

采集一個頁面：

http://www.zyctd.com/gqqg/

我要獲取上面的四個信息：標題，數量，時間，跳轉鏈接。

thinkphp自動采集怎么實現

獲取這些信息，通過上面兩種方法都采集不到，最后才選用的正則來采集。具體方法：

/**
 * Method 3 demo: scrape the paginated listing with regular expressions.
 *
 * Builds the list of page URLs, scrapes each page through getInfo(), and
 * prints title / quantity / publish time / link for every entry found.
 */
public function index(){
    $url = "http://www.zyctd.com/gqqg/";
    // Page URLs look like: http://www.zyctd.com/gqqg-p1.html
    $supplyDB = M('supply');
    $urlList = array();
    $array = array();
    // NOTE(review): the original loop header and body were lost when this
    // snippet was extracted; the loop below is rebuilt from the URL pattern
    // in the comment above. The original page count is unknown - TODO confirm
    // and raise $pageCount to the number of listing pages to crawl.
    $pageCount = 1;
    for($x = 1; $x <= $pageCount; $x++){
        $urlList[] = rtrim($url, '/') . '-p' . $x . '.html';
    }
    foreach($urlList as $v){
        $curPageList = $this->getInfo($v);
        array_push($array,$curPageList);
    }
    foreach($array as $v){
        foreach($v as $vv){
            //echo $vv['title']."__".$vv['weight']."__".$vv['time']."<br>";
            $data = array();
            $data['title'] = $vv['title'];
            $data['weight'] = $vv['weight'];
            $data['add_time'] = $vv['add_time'];
            $data['url'] = $vv['url'];
            // Persisting to the `supply` table is disabled in this demo.
            //$res = $supplyDB->add($data);
            //echo $res;
            // NOTE(review): these values come from a third-party page and are
            // echoed unescaped; escape them before using this outside a demo.
            echo "<p><span>".$vv['title']."</span>
            <span>".$vv['weight']."</span>
            <span>".$vv['add_time']."</span>
            <span>".$vv['url']."</span></p>";
        }
    }
        // Fetch info
        //$curPageList = $this->getInfo($html);
        //p($curPageList);
}

/**
 * Scrape one listing page into a list of rows.
 *
 * The patterns run against the whitespace-stripped HTML returned by
 * getHtml(), which is why tag names and attributes appear fused
 * (e.g. `<divclass=`).
 *
 * @param string $url Page URL to scrape.
 * @return array List of rows with keys title, weight, add_time, url;
 *               a field the page did not yield is an empty string.
 */
private function getInfo($url){
    $html = $this->getHtml($url);
    $array = array();
    // Match all titles.
    // NOTE(review): the attribute part of this pattern was stripped during
    // extraction; "sulist_title" is assumed from the selector used in the
    // QueryList example - TODO confirm against the live page markup.
    preg_match_all('#<divclass="sulist_title"><i></i><span>(.*?)</span>#',$html,$matches);
    $all_title = $matches[1];
    // Match all publish times.
    preg_match_all("#<i>發布時間：</i><span>(.*?)</span>#",$html,$matches);
    $all_time = $matches[1];
    // Match all requested quantities.
    preg_match_all("#<i>求購數量：</i><span>(.*?)</span>#",$html,$matches);
    $all_weight = $matches[1];
    // Match the detail-page links.
    // NOTE(review): the original pattern survived only as "<atarget>" with no
    // capture group, so $matches[1] was never set. The attributes below are a
    // guess - TODO confirm against the live page markup.
    preg_match_all('#<atarget="_blank"href="(.*?)"#',$html,$matches);
    $all_url = isset($matches[1]) ? $matches[1] : array();
    // Combine the parallel arrays into one row per title.
    foreach($all_title as $k => $v){
        $arr = array();
        $arr['title'] = $v;
        $arr['weight'] = isset($all_weight[$k]) ? $all_weight[$k] : '';
        $arr['add_time'] = isset($all_time[$k]) ? $all_time[$k] : '';
        $arr['url'] = isset($all_url[$k]) ? $all_url[$k] : '';
        array_push($array,$arr);
    }
    return $array;
}

/**
 * Download a page and strip all whitespace from it.
 *
 * @param string $url Page URL to download.
 * @return string The HTML with every whitespace character removed, or an
 *                empty string when the download fails.
 */
private function getHtml($url){
    $html = file_get_contents($url);
    if ($html === false) {
        return '';
    }
    // A single \s+ pass removes spaces, tabs, CR and LF. The snippet as
    // published had "#s#" (backslash lost), which would delete every "s".
    return preg_replace('#\s+#', '', $html);
}

以上就是thinkphp實現自動采集功能的三種方法。

文章版權歸作者所有，未經允許請勿轉載。

THE END

喜歡就支持一下吧

點贊7