Create a tool that counts the number of words on a web page. For any given URL, the tool will return the number of words displayed by a browser on the webpage (i.e. everything below the body tag).
When first run, it displays a form with three input boxes (allowing the user to test up to three URLs). When the form is submitted, the page submits back to itself, uses cURL to get the source of each web page, and then prints each URL along with its word count.
// Fetch the raw source of a URL with cURL.
//
// $url    the address to download; redirects are followed.
// returns the response body as a string, or false when the handle cannot be
//         created or the transfer fails (the same value curl_exec() reports).
function getContent($url)
{
    $ch = curl_init($url);
    if ($ch === false) {
        // No handle: report failure the same way curl_exec() would.
        return false;
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // The URL is supplied by the user, so only allow web protocols; this
    // keeps file://, gopher:// and similar schemes from being fetched,
    // both directly and through redirects.
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    // Do not let an unresponsive host hang the page indefinitely.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $output = curl_exec($ch);
    curl_close($ch);

    return $output;
}
// Remove every element delimited by $opentag ... $closetag (for example
// "<script" ... "</script>") from $mystring, including the tags themselves
// and everything between them. Each removed element is replaced by a single
// space so the text on either side does not run together.
//
// $mystring  the text to clean
// $opentag   start of the opening tag, matched case-insensitively
// $closetag  the full closing tag, matched case-insensitively
// returns    the cleaned string; an opening tag that has no closing tag
//            after it is left untouched
function removetag($mystring, $opentag, $closetag)
{
    $mystring = (string) $mystring;
    if ($opentag === '' || $closetag === '') {
        return $mystring;
    }

    $closeLen = strlen($closetag);
    $offset = 0;

    while (($start = stripos($mystring, $opentag, $offset)) !== false) {
        // Look for the closing tag only after the opening tag, so a stray
        // earlier closing tag cannot produce a bogus range.
        $end = stripos($mystring, $closetag, $start);
        if ($end === false) {
            break;
        }

        // Positive length covering opening tag, content and closing tag.
        $mystring = substr_replace($mystring, ' ', $start, ($end + $closeLen) - $start);

        // Resume just past the single space that replaced the element.
        $offset = $start + 1;
    }

    return $mystring;
}
// Extract the body portion of an HTML document.
//
// $string  the full page source
// returns  the text from the opening "<body" tag up to (not including) the
//          first "</body>" after it. When there is no "<body" the extract
//          starts at the beginning; when there is no "</body>" it runs to
//          the end of the input.
function getBody($string)
{
    $string = (string) $string;
    if ($string === '') {
        return '';
    }

    $start = stripos($string, "<body");
    if ($start === false) {
        $start = 0;
    }

    // Search after the opening tag so the length can never be negative.
    $end = stripos($string, "</body>", $start);
    if ($end === false) {
        return substr($string, $start);
    }

    return substr($string, $start, $end - $start);
}
// Reduce an HTML fragment to plain text suitable for word counting.
//
// $erg     HTML markup (typically the body of a page)
// returns  the text with <style>/<script> elements removed, all remaining
//          tags stripped and HTML entities decoded once
function formatString($erg)
{
    // Drop style and script elements first, while the markup is still
    // intact, so their contents never reach the tag stripper.
    $text = removetag($erg, "<style", "</style>");
    $text = removetag($text, "<script", "</script>");

    // Put a space in front of every tag so that text from adjacent elements
    // ("<p>one</p><p>two</p>") is not glued into a single word once the
    // tags are gone.
    $text = str_replace('<', ' <', $text);
    $text = strip_tags($text);

    // Decode entities exactly once; a second pass would turn an escaped
    // entity such as "&amp;lt;" into "<".
    return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
}
// Count the words in the body of the page at $url.
//
// Pipeline: download the page, keep only its body, reduce the body to
// plain text, then count the words with str_word_count().
function getWordcount($url)
{
    $source = getContent($url);
    $plainText = formatString(getBody($source));

    return str_word_count($plainText);
}
// Check whether $url looks like a valid http/https address.
//
// A URL without a scheme is checked as if it started with "http://". The
// prefix is applied to a local copy only; the caller's value is unchanged.
//
// $url     the address typed by the user
// returns  true when the address matches scheme://host[:port][/path],
//          false when it is blank or malformed
function validateUrl($url)
{
    $url = (string) $url;

    // A blank (or whitespace-only) URL is never valid.
    if (trim($url) === '') {
        return false;
    }

    // Add the default scheme only when none is present, so "https://..."
    // is not turned into "http://https://...".
    if (preg_match('#^https?://#i', $url) !== 1) {
        $url = "http://" . $url;
    }

    // "#" delimiters avoid escaping slashes; the dot between host labels is
    // escaped so it matches a literal "."; the D modifier stops "$" from
    // accepting a trailing newline.
    $pattern = '#^https?://[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$#iD';

    return preg_match($pattern, $url) === 1;
}
?>
All of the other helper functions are called from within getWordcount(). So to use this code, just call the getWordcount() function and pass the URL as the parameter.
Monday, June 9, 2008
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment