Aflæs fil med cURL - Hjælp til OOP
Jeg har fundet en tutorial, som viser en kode, der ser ud til at passe fint til mit behov. Mit problem er, at jeg ikke rigtigt forstår koden, da der ikke er nogen dokumentation på de enkelte linjer. Jeg kan derfor ikke omdanne den til at aflæse den fil, jeg gerne vil have aflæst. Han har givet få enkle forklaringer, men det er desværre ikke tilstrækkeligt til, at jeg kan bruge koden. Hans tutorial findes her: http://www.bradino.com/php/php-screen-scraping/#comment-3059
Kan nogen hjælpe med at forklare koden?
Jeg har hovedsageligt brug for hjælp til at forstå Step 3.
På forhånd mange tak.
// Step 1 - Initialize the class and fetch the page.
// fetch() stores the downloaded body in $scrape->result; removeNewlines() then
// decodes HTML entities and strips tabs, line breaks and double spaces so the
// whole page becomes one long line that the string helpers can search.
include ('cScrape.php');
$scrape = new Scrape();
$url = 'http://www.newyork411.com/Ad_Agencies_Production_Companies/category-cid-50553.htm';
$scrape->fetch($url);
$data = $scrape->removeNewlines($scrape->result);
// Step 2 - Find the anchor and cut out the chunk of HTML that holds the listing.
// fetchBetween(..., true) keeps the opening and closing markers in the result,
// so $data becomes the first matching <table ...> ... </table> fragment.
$data = $scrape->fetchBetween('<table width="490" border="0" cellpadding="3"','</table>',$data,true);
// Split that table into rows. The match is case-sensitive, so a row must start
// with an upper-case '<TR' and end with a lower-case '</tr>' to be picked up.
$rows = $scrape->fetchAllBetween('<TR','</tr>',$data,true);
// Step 3 - Parse the individual values of each row and print the first record as a demo.
foreach ($rows as $id => $row){
// Start a fresh record for this row.
$record = array();
// Split the row into its <td>...</td> cells (tags included because of `true`).
$cells = $scrape->fetchAllBetween('<td','</td>',$row,true);
// The second cell (index 1) holds the company link; strip_tags() leaves only its text.
$record['company'] = strip_tags($cells[1]);
// Take the href value of the link in that same cell and prefix the site root
// to get the URL of the company's detail page.
$url = 'http://www.newyork411.com' . $scrape->fetchBetween('<a href="','">',$cells[1],false);
// Encode spaces so the URL is valid for the HTTP request.
$url = str_replace(' ','%20',$url);
// Download the detail page and flatten it the same way as in Step 1.
$scrape->fetch($url);
$data2 = $scrape->removeNewlines($scrape->result);
// Keep only the <div id="tabText"> ... </div> fragment (up to the first '</div>').
$data2 = $scrape->fetchBetween('<div id="tabText">','</div>',$data2,true);
// Within that fragment, keep what follows the first '</table>' (marker excluded).
$data2 = $scrape->fetchAfter('</table>',$data2,false);
// The remaining text is treated as a list of lines separated by '<br />'.
$details = explode('<br />',$data2);
// Line 0 is used as the street address.
$record['address'] = $details[0];
// Line 1 is expected to look like "City, ST 12345": split on the comma first...
$location = explode(',',$details[1]);
$record['city'] = trim($location[0]);
// ...then split the part after the comma on a space into state and zip.
$location = explode(' ',trim($location[1]));
$record['state'] = trim($location[0]);
$record['zip'] = trim($location[1]);
// Lines 2..5 may hold phone, fax, web and email in any order; each line is
// recognised by its label prefix, and the label is removed from the stored value.
for($i=2; $i<=5; $i++){
$detail = trim($details[$i]);
if(substr($detail,0,6)=='Phone:') $record['phone'] = str_replace('Phone: ','',$detail);
else if(substr($detail,0,4)=='Fax:') $record['fax'] = str_replace('Fax: ','',$detail);
// Web and Email values are additionally passed through strip_tags() to drop any markup.
else if(substr($detail,0,4)=='Web:') $record['web'] = strip_tags(str_replace('Web: ','',$detail));
else if(substr($detail,0,6)=='Email:') $record['email'] = strip_tags(str_replace('Email: ','',$detail));
}
// Dump the record and stop the script, so only the first row is processed.
// NOTE(review): the closing brace of the foreach is not present in this excerpt.
print_r($record);
die();
//Her er den inkluderede fil
<?php
/**
 * Small cURL-based page fetcher with string helpers for screen scraping.
 *
 * Typical use: call fetch() to download a page into $result, flatten it with
 * removeNewlines(), then cut out fragments with fetchBetween(),
 * fetchAllBetween(), fetchBefore() and fetchAfter().
 */
class Scrape {
    /** Extra HTTP request headers, one complete "Name: value" string per entry. */
    public $headers = array();

    /** Body returned by the last fetch(), or false when the transfer failed. */
    public $result;

    /** cURL error message of the last fetch(); empty string when none occurred. */
    public $error;

    /**
     * File used both to read and to store cookies between requests.
     * The default is the original author's local path; set this to a writable
     * file on your own system before calling fetch() if you need cookies.
     */
    public $cookieFile = 'C:\Users\BRADINO\Desktop\cookie.txt';

    /**
     * Nothing to set up; the object is configured through its public properties.
     */
    public function __construct() {
    }

    /**
     * Queue an additional HTTP header that is sent with every later fetch().
     *
     * @param string $header Complete header line, e.g. "Accept-Language: da".
     */
    public function setHeader($header) {
        $this->headers[] = $header;
    }

    /**
     * Download $url and store the body in $this->result and any cURL error
     * message in $this->error.
     *
     * @param string       $url  Address to request.
     * @param array|string $data When a non-empty array is given, the request is
     *                           sent as a POST with these fields; otherwise GET.
     */
    public function fetch($url, $data='') {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // Return only the body (no response headers) as a string from curl_exec().
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");
        // Follow redirects, but give up after five hops.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookieFile);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookieFile);
        // NOTE(review): TLS certificate and host verification are switched off,
        // so HTTPS responses are not authenticated. Kept as in the original;
        // enable both options before using this on anything sensitive.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        if (is_array($data) && count($data) > 0) {
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
        }
        if (is_array($this->headers) && count($this->headers) > 0) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $this->headers);
        }
        $this->result = curl_exec($ch);
        $this->error = curl_error($ch);
        curl_close($ch);
    }

    /**
     * Return the part of $haystack that precedes the first $needle.
     *
     * @param string $needle   Marker to search for.
     * @param string $haystack Text to search in.
     * @param bool   $include  When truthy, the marker itself is kept at the end.
     * @return string|null Trimmed fragment, or null when the marker is absent.
     */
    public function fetchBefore($needle, $haystack, $include=false) {
        $start = strpos($haystack, $needle);
        if ($start === false) { return null; }
        $length = $include ? $start + strlen($needle) : $start;
        return trim((string) substr($haystack, 0, $length));
    }

    /**
     * Return the part of $haystack that follows the first $needle.
     *
     * @param string $needle   Marker to search for.
     * @param string $haystack Text to search in.
     * @param bool   $include  When truthy, the marker itself is kept at the start.
     * @return string|null Trimmed fragment, or null when the marker is absent.
     */
    public function fetchAfter($needle, $haystack, $include=false) {
        $start = strpos($haystack, $needle);
        if ($start === false) { return null; }
        $position = $include ? $start : $start + strlen($needle);
        return trim((string) substr($haystack, $position));
    }

    /**
     * Return the text between the first $needle1 and the next $needle2 after it.
     *
     * @param string $needle1  Opening marker.
     * @param string $needle2  Closing marker, searched for after the opening one.
     * @param string $haystack Text to search in.
     * @param bool   $include  When truthy, both markers are kept in the result.
     * @return string|null Trimmed fragment, or null when either marker is absent.
     */
    public function fetchBetween($needle1, $needle2, $haystack, $include=false) {
        $position = strpos($haystack, $needle1);
        if ($position === false) { return null; }
        if (!$include) { $position += strlen($needle1); }
        $position2 = strpos($haystack, $needle2, $position);
        if ($position2 === false) { return null; }
        if ($include) { $position2 += strlen($needle2); }
        return trim((string) substr($haystack, $position, $position2 - $position));
    }

    /**
     * Return every fragment that lies between $needle1 and the nearest
     * following $needle2. Matching does not cross line breaks, so flatten the
     * input with removeNewlines() first when fragments span several lines.
     *
     * @param string $needle1  Opening marker (literal text, not a pattern).
     * @param string $needle2  Closing marker (literal text, not a pattern).
     * @param string $haystack Text to search in.
     * @param bool   $include  When truthy, both markers are kept in each result.
     * @return array List of fragments in document order; empty when none match.
     */
    public function fetchAllBetween($needle1, $needle2, $haystack, $include=false) {
        // Escape the markers so characters such as ? ( [ . or the | delimiter
        // are matched literally instead of being interpreted as regex syntax.
        $exp = '|' . preg_quote($needle1, '|') . '(.*)' . preg_quote($needle2, '|') . '|U';
        $matches = array();
        preg_match_all($exp, $haystack, $matches);
        // Group 0 is the whole match (markers included), group 1 the inner text.
        $i = $include ? 0 : 1;
        return isset($matches[$i]) ? $matches[$i] : array();
    }

    /**
     * Decode HTML entities in $input, then delete tabs, line feeds, carriage
     * returns, NUL bytes, vertical tabs and every pair of consecutive spaces.
     *
     * @param string $input Raw text or HTML.
     * @return string Flattened text.
     */
    public function removeNewlines($input) {
        return str_replace(array("\t","\n","\r","\x20\x20","\0","\x0B"), "", html_entity_decode($input));
    }

    /**
     * Strip HTML/PHP tags from $input; thin wrapper around strip_tags().
     *
     * @param string $input   Text to clean.
     * @param string $allowed Tags to keep, in strip_tags() format (e.g. "<a><b>").
     * @return string Text without the disallowed tags.
     */
    public function removeTags($input, $allowed='') {
        return strip_tags($input, $allowed);
    }
}
?>