被公侵犯人妻一区二区三区,日韩亚洲欧美中文三级

當前位置：首頁 > 范文|應用文 > IT技術專欄 > 網絡編程

php提取網頁正文內容的例子

來源：易賢網 閱讀：742 次 日期：2015-01-07 15:07:08

溫馨提示：易賢網小編為您整理了“php提取網頁正文內容的例子”，方便廣大網友查閱！

因為難點在于如何去識別并保留網頁中的文章部分，而且刪除其它無用的信息，并且要做到通用化，不能像火車頭那樣根據目標站來制定采集規則，因為搜索引擎結果中有各種的網頁。

抓回一個頁面的數據，如何匹配出正文部分，鄭曉在下班路上想了個思路是：

1. 提取出body標簽部分–>剔除所有鏈接–>剔除所有script、注釋–>剔除所有空白標簽(包括標簽內(nèi)不含中文的)–>獲取結(jié)果。

2. 直接匹配出非鏈接的、符合在div、p、h標簽中的中文部分???

還是會有不少其它多余信息啊，比如底部信息等。。如何搞?不知道大家有木有什么思路或建議?

這個類是從網上找到的一個php實現的提取網頁正文部分的算法，鄭曉在本地也測試了下，準確率非常高。

代碼如下:

<?php

/**
 * Extracts the main article content from an HTML page.
 *
 * The page is parsed into a DOM tree, every <p> element contributes a score
 * to its parent element, and the highest-scoring parent is returned as the
 * article body after junk tags and attributes have been stripped from it.
 */
class readability {

    /** Name of the DOM attribute in which a candidate element's score is stored. */
    const attr_content_score = 'contentscore';

    /** Charset the DOM parser is driven in; only utf-8 is supported. */
    const dom_default_charset = 'utf-8';

    /** Exception message used when no main-content element can be determined. */
    const message_can_not_get = 'readability was unable to parse this page for content.';

    /** @var DOMDocument|null Parsed document. */
    protected $dom = null;

    /** @var string The source exactly as passed to the constructor. */
    protected $source = '';

    /** @var DOMElement[] Parents of the paragraphs scored by the last gettopbox() run. */
    private $parentnodes = array();

    /** @var string[] Tag names that are removed from the extracted content. */
    private $junktags = array(
        'style', 'form', 'iframe', 'script', 'button', 'input', 'textarea',
        'noscript', 'select', 'option', 'object', 'applet', 'basefont',
        'bgsound', 'blink', 'canvas', 'command', 'menu', 'nav', 'datalist',
        'embed', 'frame', 'frameset', 'keygen', 'label', 'marquee', 'link',
    );

    /** @var string[] Attribute names that are removed from every extracted element. */
    private $junkattrs = array('style', 'class', 'onclick', 'onmouseover', 'align', 'border', 'margin');

    /**
     * Parses $source into the internal DOM document.
     *
     * A parse failure is deliberately not fatal: the document is left empty
     * and getcontent() reports the problem later by throwing.
     *
     * @param string $source     HTML source of the page.
     * @param string $input_char Encoding of $source. Defaults to utf-8.
     */
    public function __construct($source, $input_char = 'utf-8') {
        $this->source = $source;

        // Turn every non-ASCII character into an HTML entity so the parser
        // only ever sees ASCII, whatever the input encoding was.
        // NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated
        // as of PHP 8.2; it is kept because it defines the current output.
        $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);

        // Normalise the markup before handing it to the DOM parser.
        $source = $this->preparsource($source);

        $this->dom = new DOMDocument('1.0', $input_char);

        // Real-world HTML triggers many libxml warnings; collect them
        // internally instead of emitting them, then restore the old setting.
        $previous = libxml_use_internal_errors(true);
        try {
            // The <?xml encoding="..."> prefix is a hack that forces libxml
            // to treat the input as utf-8.
            $loaded = $this->dom->loadHTML(
                '<?xml encoding="' . self::dom_default_charset . '">' . $source
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if (!$loaded) {
            return;
        }

        // Remove the processing-instruction node created by the prefix hack.
        // childNodes is a live list, so iterate over a snapshot while removing.
        foreach (iterator_to_array($this->dom->childNodes) as $item) {
            if ($item->nodeType === XML_PI_NODE) {
                $this->dom->removeChild($item);
            }
        }

        // Record the proper encoding on the document.
        $this->dom->encoding = self::dom_default_charset;
    }

    /**
     * Pre-processes the HTML so the DOM parser can handle it reliably.
     *
     * @param string $string HTML source.
     * @return string Cleaned and trimmed HTML.
     */
    private function preparsource($string) {
        // Drop the first charset declaration; the encoding is forced separately.
        $string = preg_replace('/charset=([\w|\-]+);?/', '', $string, 1);

        // Replace doubled-up <br> tags with a paragraph break, and remove <font> tags.
        $string = preg_replace('/<br\/?>[ \r\n\s]*<br\/?>/i', '</p><p>', $string);
        $string = preg_replace('/<\/?font[^>]*>/i', '', $string);

        // Strip <script> elements before parsing.
        $string = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $string);

        return trim($string);
    }

    /**
     * Removes every $tagname element below $rootnode.
     *
     * @param DOMDocument|DOMElement $rootnode Node to clean.
     * @param string                 $tagname  Tag name to remove.
     * @return DOMDocument|DOMElement The same $rootnode, for chaining.
     */
    private function removejunktag($rootnode, $tagname) {
        $tags = $rootnode->getElementsByTagName($tagname);

        // Always take index 0: the list is live, so removing an element
        // also removes it from the result set.
        while ($tag = $tags->item(0)) {
            $tag->parentNode->removeChild($tag);
        }

        return $rootnode;
    }

    /**
     * Removes the attribute $attr from every element below $rootnode.
     *
     * @param DOMDocument|DOMElement $rootnode Node to clean.
     * @param string                 $attr     Attribute name to remove.
     * @return DOMDocument|DOMElement The same $rootnode, for chaining.
     */
    private function removejunkattr($rootnode, $attr) {
        foreach ($rootnode->getElementsByTagName('*') as $tag) {
            $tag->removeAttribute($attr);
        }

        return $rootnode;
    }

    /**
     * Scores the parent of every <p> element and returns the best one.
     *
     * Each paragraph adjusts its parent's score by -50 for a negative
     * class/id name, +25 for a positive one, and by the paragraph's byte
     * length when that is greater than 10.
     *
     * @return DOMElement|null The highest-scoring parent, or null if none scored above zero.
     */
    private function gettopbox() {
        // Scores live in DOM attributes; clear earlier results so that
        // repeated calls do not accumulate scores.
        $this->parentnodes = array();
        foreach ($this->dom->getElementsByTagName('*') as $element) {
            $element->removeAttribute(self::attr_content_score);
        }

        $negative = '/(comment|meta|footer|footnote)/i';
        // NOTE(review): the two positive patterns were missing from the code
        // as received; these follow the usual Readability heuristic - confirm.
        $positiveclass = '/(^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$)/i';
        $positiveid = '/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i';

        foreach ($this->dom->getElementsByTagName('p') as $paragraph) {
            $parentnode = $paragraph->parentNode;
            if (!($parentnode instanceof DOMElement)) {
                continue;
            }

            $contentscore = intval($parentnode->getAttribute(self::attr_content_score));
            $classname = $parentnode->getAttribute('class');
            $id = $parentnode->getAttribute('id');

            // Look for a special class name.
            if (preg_match($negative, $classname)) {
                $contentscore -= 50;
            } elseif (preg_match($positiveclass, $classname)) {
                $contentscore += 25;
            }

            // Look for a special id.
            if (preg_match($negative, $id)) {
                $contentscore -= 50;
            } elseif (preg_match($positiveid, $id)) {
                $contentscore += 25;
            }

            // Longer paragraphs add more weight; strlen() is byte-based.
            $length = strlen($paragraph->nodeValue);
            if ($length > 10) {
                $contentscore += $length;
            }

            // Store the score on the parent and remember the parent.
            $parentnode->setAttribute(self::attr_content_score, (string) $contentscore);
            $this->parentnodes[] = $parentnode;
        }

        // Pick the candidate with the highest strictly positive score.
        $topbox = null;
        $topscore = 0;
        foreach ($this->parentnodes as $candidate) {
            $score = intval($candidate->getAttribute(self::attr_content_score));
            if ($score > $topscore) {
                $topbox = $candidate;
                $topscore = $score;
            }
        }

        return $topbox;
    }

    /**
     * Returns the page title.
     *
     * When the <title> text contains ' - ' separators only the leading
     * segment is returned.
     *
     * @return string|null The title, or null when the page has no <title>.
     */
    public function gettitle() {
        $split_point = ' - ';
        $titlenodes = $this->dom->getElementsByTagName('title');

        if ($titlenodes->length && $titlenode = $titlenodes->item(0)) {
            $title = trim($titlenode->nodeValue);
            // Split from the right: reverse, explode, reverse each part back.
            $result = array_map('strrev', explode($split_point, strrev($title)));

            return sizeof($result) > 1 ? array_pop($result) : $title;
        }

        return null;
    }

    /**
     * Returns the src attribute of the first <img> below $node.
     *
     * @param DOMDocument|DOMElement $node Node to search.
     * @return string|null The image URL, or null when there is no image.
     */
    public function getleadimageurl($node) {
        $images = $node->getElementsByTagName('img');

        if ($images->length && $leadimage = $images->item(0)) {
            return $leadimage->getAttribute('src');
        }

        return null;
    }

    /**
     * Returns the extracted main content of the page.
     *
     * @return array|false Array with the keys 'lead_image_url', 'word_count',
     *                     'title' and 'content'; false when no DOM is available.
     * @throws RuntimeException When no suitable content element is found.
     */
    public function getcontent() {
        if (!$this->dom) {
            return false;
        }

        $contenttitle = $this->gettitle();
        $contentbox = $this->gettopbox();

        // Check that a suitable top box was found.
        if ($contentbox === null) {
            throw new RuntimeException(self::message_can_not_get);
        }

        // Copy the content into a fresh document so cleaning leaves the source DOM intact.
        $target = new DOMDocument;
        $target->appendChild($target->importNode($contentbox, true));

        foreach ($this->junktags as $tag) {
            $target = $this->removejunktag($target, $tag);
        }

        foreach ($this->junkattrs as $attr) {
            $target = $this->removejunkattr($target, $attr);
        }

        // saveHTML() emits entities for non-ASCII characters; decode them to utf-8.
        $content = mb_convert_encoding($target->saveHTML(), self::dom_default_charset, 'HTML-ENTITIES');

        return array(
            'lead_image_url' => $this->getleadimageurl($target),
            'word_count' => mb_strlen(strip_tags($content), self::dom_default_charset),
            'title' => $contenttitle ? $contenttitle : null,
            'content' => $content,
        );
    }

    public function __destruct() { }

}

使用起來也非常簡單，實例化時傳入網頁的html源碼和相應的編碼，然后直接調用其getcontent方法即可返回提取到的正文部分，提取出的文章中可能還會含有少部分鏈接，可以自己后期再修改。

更多信息請查看IT技術專欄

更多信息請查看網絡編程

上一篇：asp.net通過動態(tài)加載不同css實現(xiàn)多界面

下一篇：ado.net執(zhí)行oracle 存儲過程

易賢網手機網站地址：php提取網頁正文內容的例子

由于各方面情況的不斷調整與變化，易賢網提供的所有考試信息和咨詢回復僅供參考，敬請考生以權威部門公布的正式信息和咨詢為準！

相關閱讀 網絡編程

Shell中如何刪除文本比較長的行的實現方法 10月30日

vue.js語法及常用指令10月30日

python 讀寫中文json的實例詳解10月30日

Objective-C Json 實例詳解10月30日

bootstrap table sum總數(shù)量統(tǒng)計實現(xiàn)方法10月30日

python生成二維碼的實例詳解10月30日

Python批量更改文件名的實現(xiàn)方法10月30日

解決出現(xiàn)Incorrect integer value的問題10月30日

jQuery實現(xiàn)切換隱藏與顯示同時切換圖標功能10月30日

docker python api 安裝配置的詳解10月30日

javascript按鈕禁用和啟用的效果實例代碼10月30日

vue.js todolist實現(xiàn)代碼10月30日

vue.js 父向子組件傳參的實例代碼10月30日

apache 開啟重定向 rewrite的實現(xiàn)方法10月30日

Vue.js劃分組件的方法10月30日

python logging日志模塊的詳解10月30日

vue中的scope使用詳解10月30日

docker cgroup 資源監(jiān)控的詳解10月30日

使用Android Studio 開發(fā)自己的SDK教程10月23日

linux系統(tǒng)下MongoDB單節(jié)點安裝教程10月23日

易賢網(wǎng)移動網(wǎng)站

2025國考·省考課程試聽報名

報班類型
姓名
手機號
驗證碼