Articlebase.com scraping tutorial – part 3, getting full article

In the first part, I showed how to get the article links under any category. In the second part, I showed how to get the links for any search term. In this part, I will show how to fetch the full content of an article.

Let’s get the HTML of the article page.

// Placeholder: replace this with the URL of the article page you want to fetch.
$link = 'artcile_base_article_link';

// Download the raw HTML of the page (false is returned when the request fails).
$html = file_get_contents($link);

Now create the DOMDocument and DOMXPath objects.

$dom = new DOMDocument();
// Real-world pages are rarely valid markup: let libxml collect its parse
// errors internally instead of hiding them with the @ operator.
$previousSetting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
// Restore the caller's libxml error-handling mode.
libxml_use_internal_errors($previousSetting);
$xpath = new DOMXPath($dom);

Create an empty array to hold our findings:

// Collects the extracted parts of the article, keyed by name.
$result = [];

We will first get the article title.

// The first <h1> inside the "article_pg" container holds the title.
$elements = $xpath->query("//div[@class='article_pg']//h1");
$element = $elements->item(0);
// item(0) is null when nothing matched, so fall back to an empty title.
$result['title'] = $element !== null ? strip_tags($element->nodeValue) : '';

We now have the article title. Next, we will get the article body.

// The article body lives in the div whose class is exactly "article_cnt KonaBody".
$elements = $xpath->query("//div[@class='article_cnt KonaBody']");
$element = $elements->item(0);
// Guard against a missing container: saveXML(null) would serialise the
// whole document instead of just the body.
$result['body'] = $element !== null ? $dom->saveXML($element) : '';

You may want to clean up the HTML at this point, as it may contain some site-specific attributes. I am not going to show that here.

Next, get the article’s author bio.

// Get the author bio: concatenate the markup of every paragraph that is a
// direct child of the "author_details" container.
$result['author_bio'] = '';
$elements = $xpath->query("//div[@class='author_details']/p");

// The node list is traversable; the loop simply does nothing when no paragraph matched.
foreach ($elements as $element) {
    $result['author_bio'] .= $dom->saveXML($element);
}

We now have everything we need from the article, and you can process it further as required. The full code looks like this:

/**
 * Fetch an article page and extract its title, body and author bio.
 *
 * @param string $url URL (or local path) of the article page to read.
 *
 * @return array|false Array with the keys 'title' (plain text), 'body'
 *                     (markup of the body container) and 'author_bio'
 *                     (concatenated markup of the bio paragraphs). A part
 *                     that is not found on the page is an empty string.
 *                     Returns false when the page cannot be fetched or is empty.
 */
function get_article($url)
{
    $html = file_get_contents($url);
    if (!$html || trim($html) === '') {
        return false;
    }

    $dom = new DOMDocument();
    // Real-world pages are rarely valid markup: let libxml collect its parse
    // errors internally instead of hiding them with the @ operator.
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($dom);

    // Every key is always present, so callers never have to test for it.
    $result = array(
        'title'      => '',
        'body'       => '',
        'author_bio' => '',
    );

    // Title: the first <h1> inside the "article_pg" container.
    $titleNode = $xpath->query("//div[@class='article_pg']//h1")->item(0);
    if ($titleNode !== null) {
        $result['title'] = strip_tags($titleNode->nodeValue);
    }

    // Body: the div whose class is exactly "article_cnt KonaBody".
    // The null check matters: saveXML(null) would serialise the whole document.
    $bodyNode = $xpath->query("//div[@class='article_cnt KonaBody']")->item(0);
    if ($bodyNode !== null) {
        $result['body'] = $dom->saveXML($bodyNode);
    }

    // Author bio: every paragraph that is a direct child of "author_details".
    foreach ($xpath->query("//div[@class='author_details']/p") as $paragraph) {
        $result['author_bio'] .= $dom->saveXML($paragraph);
    }

    return $result;
}