View file sitemap/sitemap.class.php

File size: 6.79Kb
<?
class sitemap
{
	private $sitemap_urls = array();
	private $base;
	private $protocol;
	private $domain;
	private $check = array();
	private $proxy = "";
	
	//setting list of substring to ignore in urls
	public function set_ignore($ignore_list){
		$this->check = $ignore_list;
	}
	//set a proxy host and port (such as someproxy:8080 or 10.1.1.1:8080
	public function set_proxy($host_port){
		$this->proxy = $host_port;
	}
	//validating urls using list of substrings
	private function validate($url){
		$valid = true;
		//add substrings of url that you don't want to appear using set_ignore() method
		foreach($this->check as $val)
		{
			if(stripos($url, $val) !== false)
			{
				$valid = false;
				break;
			}
		}
		return $valid;
	}
	
	//multi curl requests
	private function multi_curl($urls){
		// for curl handlers
		$curl_handlers = array();
		//setting curl handlers
		foreach ($urls as $url) 
		{
			$curl = curl_init();
			curl_setopt($curl, CURLOPT_URL, $url);
			curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
			if (isset($this->proxy) && !$this->proxy == '') 
			{
				curl_setopt($curl, CURLOPT_PROXY, $this->proxy);
			}
			$curl_handlers[] = $curl;
		}
		//initiating multi handler
		$multi_curl_handler = curl_multi_init();
	
		// adding all the single handler to a multi handler
		foreach($curl_handlers as $key => $curl)
		{
			curl_multi_add_handle($multi_curl_handler,$curl);
		}
		
		// executing the multi handler
		do 
		{
			$multi_curl = curl_multi_exec($multi_curl_handler, $active);
		} 
		while ($multi_curl == CURLM_CALL_MULTI_PERFORM  || $active);
		
		foreach($curl_handlers as $curl)
		{
			//checking for errors
			if(curl_errno($curl) == CURLE_OK)
			{
				//if no error then getting content
				$content = curl_multi_getcontent($curl);
				//parsing content
				$this->parse_content($content);
			}
		}
		curl_multi_close($multi_curl_handler);
		return true;
	}

	//function to call
	public function get_links($domain){
		//getting base of domain url address
		$this->base = str_replace("http://", "", $domain);
		$this->base = str_replace("https://", "", $this->base);
		$host = explode("/", $this->base);
		$this->base = $host[0];
		//getting proper domain name and protocol
		$this->domain = trim($domain);
		if(strpos($this->domain, "http") !== 0)
		{
			$this->protocol = "http://";
			$this->domain = $this->protocol.$this->domain;
		}
		else
		{
			$protocol = explode("//", $domain);
			$this->protocol = $protocol[0]."//";
		}
		
		if(!in_array($this->domain, $this->sitemap_urls))
		{
			$this->sitemap_urls[] = $this->domain;
		}
		//requesting link content using curl
		$curl = curl_init();
		curl_setopt($curl, CURLOPT_URL, $this->domain);
		if (isset($this->proxy) && !$this->proxy == '') 
		{
			curl_setopt($curl, CURLOPT_PROXY, $this->proxy);
		}
		curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
		$page = curl_exec($curl);
		curl_close($curl);
		$this->parse_content($page);
	}
	
	//parses content and checks for URLs
	private function parse_content($page){
		//getting all links from href attributes
		preg_match_all("/<a[^>]*href\s*=\s*'([^']*)'|".
					'<a[^>]*href\s*=\s*"([^"]*)"'."/is", $page, $match);
		//storing new links
		$new_links = array();
		for($i = 1; $i < sizeof($match); $i++)
		{
			//walking through links
			foreach($match[$i] as $url)
			{
				//if doesn't start with http and is not empty
				if(strpos($url, "http") === false  && trim($url) !== "")
				{
					//checking if absolute path
					if($url[0] == "/") $url = substr($url, 1);
					//checking if relative path
					else if($url[0] == ".")
					{
						while($url[0] != "/")
						{
							$url = substr($url, 1);
						}
						$url = substr($url, 1);
					}
					//transforming to absolute url
					$url = $this->protocol.$this->base."/".$url;
				}
				//if new and not empty
				if(!in_array($url, $this->sitemap_urls) && trim($url) !== "")
				{
					//if valid url
					if($this->validate($url))
					{
						//checking if it is url from our domain
						if(strpos($url, "http://".$this->base) === 0 || strpos($url, "https://".$this->base) === 0)
						{
							//adding url to sitemap array
							$this->sitemap_urls[] = $url;
							//adding url to new link array
							$new_links[] = $url;
						}
					}
				}
			}
		}
		$this->multi_curl($new_links);
		return true;
	}

	//returns array of sitemap URLs
	public function get_array(){
		return $this->sitemap_urls;
	}

	//notifies services like google, bing, yahoo, ask and moreover about your site map update
	public function ping($sitemap_url, $title ="", $siteurl = ""){
		// for curl handlers
		$curl_handlers = array();
		
		$sitemap_url = trim($sitemap_url);
		if(strpos($sitemap_url, "http") !== 0)
		{
			$sitemap_url = "http://".$sitemap_url;
		}
		$site = explode("//", $sitemap_url);
		$start = $site[0];
		$site = explode("/", $site[1]);
		$middle = $site[0];
		if(trim($title) == "")
		{
			$title = $middle;
		}
		if(trim($siteurl) == "")
		{
			$siteurl = $start."//".$middle;
		}
		//urls to ping
		$urls[0] = "http://www.google.com/webmasters/tools/ping?sitemap=".urlencode($sitemap_url);
		$urls[1] = "http://www.bing.com/webmaster/ping.aspx?siteMap=".urlencode($sitemap_url);
		$urls[2] = "http://search.yahooapis.com/SiteExplorerService/V1/updateNotification".
				"?appid=YahooDemo&url=".urlencode($sitemap_url);
		$urls[3] = "http://submissions.ask.com/ping?sitemap=".urlencode($sitemap_url);
		$urls[4] = "http://rpc.weblogs.com/pingSiteForm?name=".urlencode($title).
				"&url=".urlencode($siteurl)."&changesURL=".urlencode($sitemap_url);
	
		//setting curl handlers
		foreach ($urls as $url) 
		{
			$curl = curl_init();
			curl_setopt($curl, CURLOPT_URL, $url);
			curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
			curl_setopt($curl, CURL_HTTP_VERSION_1_1, 1);
			$curl_handlers[] = $curl;
		}
		//initiating multi handler
		$multi_curl_handler = curl_multi_init();
	
		// adding all the single handler to a multi handler
		foreach($curl_handlers as $key => $curl)
		{
			curl_multi_add_handle($multi_curl_handler,$curl);
		}
		
		// executing the multi handler
		do 
		{
			$multi_curl = curl_multi_exec($multi_curl_handler, $active);
		} 
		while ($multi_curl == CURLM_CALL_MULTI_PERFORM  || $active);
		
		// check if there any error
		$submitted = true;
		foreach($curl_handlers as $key => $curl)
		{
			//you may use curl_multi_getcontent($curl); for getting content
			//and curl_error($curl); for getting errors
			if(curl_errno($curl) != CURLE_OK)
			{
				$submitted = false;
			}
		}
		curl_multi_close($multi_curl_handler);
		return $submitted;
	}
	
	//generates sitemap
	public function generate_sitemap(){
		$sitemap = new SimpleXMLElement('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>');
        foreach($this->sitemap_urls as $url) 
		{
            $url_tag = $sitemap->addChild("url");
            $url_tag->addChild("loc", htmlspecialchars($url));
		}
		return $sitemap->asXML();
	}
}
?>