<?php
/*
 * Project:     MagpieRSS: a simple RSS integration tool
 * File:        rss_fetch.inc, a simple functional interface
 *              to fetching and parsing RSS files, via the
 *              function fetch_rss()
 * Author:      Kellan Elliott-McCrea <kellan@protest.net>
 * License:     GPL
 *
 * The latest version of MagpieRSS can be obtained from:
 * http://magpierss.sourceforge.net
 *
 * For questions, help, comments, discussion, etc., please join the
 * Magpie mailing list:
 * magpierss-general@lists.sourceforge.net
 *
 */

// Setup MAGPIE_DIR for use on hosts that don't include
// the current path in include_path.
// with thanks to rajiv and smarty
if (!defined('DIR_SEP')) {
    define('DIR_SEP', DIRECTORY_SEPARATOR);
}

if (!defined('MAGPIE_DIR')) {
    define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
}

require_once( MAGPIE_DIR . 'rss_parse.inc' );
require_once( MAGPIE_DIR . 'rss_cache.inc' );

// for including 3rd party libraries
// (guarded like DIR_SEP / MAGPIE_DIR above so a caller-supplied value does
// not trigger an "already defined" notice)
if (!defined('MAGPIE_EXTLIB')) {
    define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
}
require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');


/*
 * CONSTANTS - redefine these in your script to change the
 * behaviour of fetch_rss(); currently, most options affect the cache
 *
 * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
 * For me a built in cache was essential to creating a "PHP-like"
 * feel to Magpie, see rss_cache.inc for rationale
 *
 *
 * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
 * This should be a location that the webserver can write to.   If this
 * directory does not already exist Magpie will try to be smart and create
 * it.  This will often fail for permissions reasons.
 *
 *
 * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
 *
 *
 * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
 * instead of returning stale object?
 *
 * MAGPIE_DEBUG - Display debugging notices?
 *
 */


/*=======================================================================*\
    Function:   fetch_rss:
    Purpose:    return RSS object for the given url
                maintain the cache
    Input:      url of RSS file
    Output:     parsed RSS object (see rss_parse.inc), or false on failure
                (the failure reason is available via magpie_error())

    NOTES ON CACHING:
    If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.

    NOTES ON RETRIEVING REMOTE FILES:
    When the cached object is stale and carries an ETag and/or
    Last-Modified value, fetch_rss sends a conditional GET. On a 304 it
    re-stores the cached object (refreshing the cache entry) and returns it.

    NOTES ON FAILED REQUESTS:
    If there is an HTTP error while fetching an RSS object, the cached
    version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY
    is off)
\*=======================================================================*/

define('MAGPIE_VERSION', '0.72');

$MAGPIE_ERROR = "";

function fetch_rss ($url) {
    // initialize constants
    init();

    if ( !isset($url) ) {
        error("fetch_rss called without a url");
        return false;
    }

    // if cache is disabled
    if ( !MAGPIE_CACHE_ON ) {
        // fetch file, and parse it
        $resp = _fetch_remote_file( $url );
        if ( is_success( $resp->status ) ) {
            return _response_to_rss( $resp );
        }
        else {
            error("Failed to fetch $url and cache is off");
            return false;
        }
    }
    // else cache is ON
    else {
        // Flow
        // 1. check cache
        // 2. if there is a hit, make sure its fresh
        // 3. if cached obj fails freshness check, fetch remote
        // 4. if remote fails, return stale object, or error

        $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

        if (MAGPIE_DEBUG and $cache->ERROR) {
            debug($cache->ERROR, E_USER_WARNING);
        }

        // response of check_cache. Starts as an empty string rather than
        // integer 0: before PHP 8, 0 == 'HIT' and 0 == 'STALE' both evaluate
        // to true, which made an unusable cache look like a hit.
        $cache_status    = '';
        $request_headers = array(); // HTTP headers to send with fetch
        $rss             = 0;       // parsed RSS object
        $errormsg        = 0;       // errors, if any

        // store parsed XML by desired output encoding
        // as character munging happens at parse time
        $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

        if (!$cache->ERROR) {
            // return cache HIT, MISS, or STALE
            $cache_status = $cache->check_cache( $cache_key );
        }

        // if object cached, and cache is fresh, return cached obj
        if ( $cache_status == 'HIT' ) {
            $rss = $cache->get( $cache_key );
            if ( isset($rss) and $rss ) {
                // should be cache age
                $rss->from_cache = 1;
                if ( MAGPIE_DEBUG > 1) {
                    debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
                }
                return $rss;
            }
        }

        // else attempt a conditional get

        // setup headers: send whichever validators the stale object carries.
        // The date validator must go out as If-Modified-Since; there is no
        // "If-Last-Modified" request header in HTTP.
        if ( $cache_status == 'STALE' ) {
            $rss = $cache->get( $cache_key );
            if ( $rss ) {
                if ( !empty($rss->etag) ) {
                    $request_headers['If-None-Match'] = $rss->etag;
                }
                if ( !empty($rss->last_modified) ) {
                    $request_headers['If-Modified-Since'] = $rss->last_modified;
                }
            }
        }

        $resp = _fetch_remote_file( $url, $request_headers );

        if (isset($resp) and $resp) {
            if ($resp->status == '304' ) {
                // we have the most current copy
                if ( MAGPIE_DEBUG > 1) {
                    debug("Got 304 for $url");
                }
                // reset cache on 304 (at minutillo insistent prodding)
                $cache->set($cache_key, $rss);
                return $rss;
            }
            elseif ( is_success( $resp->status ) ) {
                $rss = _response_to_rss( $resp );
                if ( $rss ) {
                    if (MAGPIE_DEBUG > 1) {
                        debug("Fetch successful");
                    }
                    // add object to cache
                    $cache->set( $cache_key, $rss );
                    return $rss;
                }
            }
            else {
                $errormsg = "Failed to fetch $url ";
                if ( $resp->status == '-100' ) {
                    $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
                }
                elseif ( $resp->error ) {
                    # compensate for Snoopy's annoying habit of tacking
                    # on '\n'
                    $http_error = substr($resp->error, 0, -2);
                    $errormsg .= "(HTTP Error: $http_error)";
                }
                else {
                    $errormsg .= "(HTTP Response: " . $resp->response_code .')';
                }
            }
        }
        else {
            $errormsg = "Unable to retrieve RSS file for unknown reasons.";
        }

        // else fetch failed

        // attempt to return cached object, unless the caller asked for
        // fresh data only (MAGPIE_CACHE_FRESH_ONLY)
        if ( $rss and !MAGPIE_CACHE_FRESH_ONLY ) {
            if ( MAGPIE_DEBUG ) {
                debug("Returning STALE object for $url");
            }
            return $rss;
        }

        // else we totally failed
        error( $errormsg );

        return false;

    } // end if ( !MAGPIE_CACHE_ON ) {
} // end fetch_rss()

/*=======================================================================*\
    Function:   error
    Purpose:    set MAGPIE_ERROR, and trigger error
    Input:      error message
                error level passed to trigger_error() (optional)
    Notes:      an empty/falsy message is ignored: nothing is stored and
                nothing is triggered
\*=======================================================================*/

function error ($errormsg, $lvl=E_USER_WARNING) {
    global $MAGPIE_ERROR;

    // append PHP's error message if track_errors enabled
    // (only has an effect when $php_errormsg exists in this scope)
    if ( isset($php_errormsg) ) {
        $errormsg .= " ($php_errormsg)";
    }
    if ( $errormsg ) {
        $errormsg = "MagpieRSS: $errormsg";
        $MAGPIE_ERROR = $errormsg;
        trigger_error( $errormsg, $lvl);
    }
}

/*=======================================================================*\
    Function:   debug
    Purpose:    trigger a PHP error carrying a "MagpieRSS [debug]" message
    Input:      debug message
                error level passed to trigger_error() (optional)
\*=======================================================================*/
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    trigger_error("MagpieRSS [debug] $debugmsg", $lvl);
}

/*=======================================================================*\
    Function:   magpie_error
    Purpose:    accessor for the magpie error variable
    Input:      new error message (optional); when non-empty it replaces
                the stored message
    Output:     the current value of MAGPIE_ERROR
\*=======================================================================*/
function magpie_error ($errormsg="") {
    global $MAGPIE_ERROR;

    if ( isset($errormsg) and $errormsg ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}

/*=======================================================================*\
    Function:   _fetch_remote_file
    Purpose:    retrieve an arbitrary remote file
    Input:      url of the remote file
                headers to send along with the request (optional)
    Output:     an HTTP response object (see Snoopy.class.inc)
\*=======================================================================*/
function _fetch_remote_file ($url, $headers = "" ) {
    // Snoopy is an HTTP client in PHP
    $client = new Snoopy();
    $client->agent = MAGPIE_USER_AGENT;
    $client->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $client->use_gzip = MAGPIE_USE_GZIP;
    // anything other than an array (e.g. the "" default) means: no extra headers
    if (is_array($headers) ) {
        $client->rawheaders = $headers;
    }

    // PHP warnings from the fetch are suppressed; callers inspect the
    // returned client's status/error fields instead
    @$client->fetch($url);
    return $client;

}

/*=======================================================================*\
    Function:   _response_to_rss
    Purpose:    parse an HTTP response object into an RSS object
    Input:      an HTTP response object (see Snoopy)
    Output:     parsed RSS object (see rss_parse), or false if parsing
                failed (the failure is reported through error())
\*=======================================================================*/
function _response_to_rss ($resp) {
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // if RSS parsed successfully
    if ( $rss and !$rss->ERROR) {

        // find Etag, and Last-Modified
        foreach ( (array) $resp->headers as $h ) {
            // 2003-03-02 - Nicola Asuni (www.tecnick.com) - fixed bug "Undefined offset: 1"
            if (strpos($h, ": ")) {
                list($field, $val) = explode(": ", $h, 2);
            }
            else {
                $field = $h;
                $val = "";
            }

            // HTTP field names are case-insensitive, so compare lowercased.
            // Values are trimmed so surrounding whitespace / line terminators
            // (presumably left on the raw header lines - confirm against
            // Snoopy) are not stored and later echoed back in the
            // conditional GET headers.
            $field = strtolower(trim($field));
            $val   = trim($val);

            if ( $field == 'etag' ) {
                $rss->etag = $val;
            }

            if ( $field == 'last-modified' ) {
                $rss->last_modified = $val;
            }
        }

        return $rss;
    } // else construct error message
    else {
        $errormsg = "Failed to parse RSS file.";

        if ($rss) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    } // end if ($rss and !$rss->error)
}

/*=======================================================================*\
    Function:   init
    Purpose:    setup constants with default values
                check for user overrides
    Notes:      runs its body only once per request; later calls return
                immediately because MAGPIE_INITALIZED is already defined
\*=======================================================================*/
function init () {
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    else {
        define('MAGPIE_INITALIZED', true);
    }

    if ( !defined('MAGPIE_CACHE_ON') ) {
        define('MAGPIE_CACHE_ON', true);
    }

    if ( !defined('MAGPIE_CACHE_DIR') ) {
        define('MAGPIE_CACHE_DIR', './cache');
    }

    if ( !defined('MAGPIE_CACHE_AGE') ) {
        define('MAGPIE_CACHE_AGE', 60*60); // one hour
    }

    if ( !defined('MAGPIE_CACHE_FRESH_ONLY') ) {
        define('MAGPIE_CACHE_FRESH_ONLY', false);
    }

    if ( !defined('MAGPIE_OUTPUT_ENCODING') ) {
        define('MAGPIE_OUTPUT_ENCODING', 'ISO-8859-1');
    }

    if ( !defined('MAGPIE_INPUT_ENCODING') ) {
        define('MAGPIE_INPUT_ENCODING', null);
    }

    if ( !defined('MAGPIE_DETECT_ENCODING') ) {
        define('MAGPIE_DETECT_ENCODING', true);
    }

    if ( !defined('MAGPIE_DEBUG') ) {
        define('MAGPIE_DEBUG', 0);
    }

    if ( !defined('MAGPIE_USER_AGENT') ) {
        $ua = 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net';

        // advertise in the user agent whether this client caches
        if ( MAGPIE_CACHE_ON ) {
            $ua = $ua . ')';
        }
        else {
            $ua = $ua . '; No cache)';
        }

        define('MAGPIE_USER_AGENT', $ua);
    }

    if ( !defined('MAGPIE_FETCH_TIME_OUT') ) {
        define('MAGPIE_FETCH_TIME_OUT', 5); // 5 second timeout
    }

    // use gzip encoding to fetch rss files if supported?
    if ( !defined('MAGPIE_USE_GZIP') ) {
        define('MAGPIE_USE_GZIP', true);
    }
}

// NOTE: the following code should really be in Snoopy, or at least
// somewhere other than rss_fetch!

/*=======================================================================*\
    HTTP STATUS CODE PREDICATES
    These functions attempt to classify an HTTP status code
    based on RFC 2616 and RFC 2518.

    All of them take an HTTP status code as input, and return true or false

    All this code is adapted from LWP's HTTP::Status.
\*=======================================================================*/


/*=======================================================================*\
    Function:   is_info
    Purpose:    return true if Informational status code
\*=======================================================================*/
function is_info ($sc) {
    return $sc >= 100 && $sc < 200;
}

/*=======================================================================*\
    Function:   is_success
    Purpose:    return true if Successful status code
\*=======================================================================*/
function is_success ($sc) {
    return $sc >= 200 && $sc < 300;
}

/*=======================================================================*\
    Function:   is_redirect
    Purpose:    return true if Redirection status code
\*=======================================================================*/
function is_redirect ($sc) {
    return $sc >= 300 && $sc < 400;
}

/*=======================================================================*\
    Function:   is_error
    Purpose:    return true if Error status code
\*=======================================================================*/
function is_error ($sc) {
    return $sc >= 400 && $sc < 600;
}

/*=======================================================================*\
    Function:   is_client_error
    Purpose:    return true if Error status code, and its a client error
\*=======================================================================*/
function is_client_error ($sc) {
    return $sc >= 400 && $sc < 500;
}

/*=======================================================================*\
    Function:   is_server_error
    Purpose:    return true if Error status code, and its a server error
\*=======================================================================*/
function is_server_error ($sc) {
    return $sc >= 500 && $sc < 600;
}

?>
For more help developing with SiT! see http://sitracker.org/wiki/DevelopmentHowTo