<?php

/**
 * MagpieRSS parser: turns the text of an RSS (0.9x/1.0/2.0) or Atom feed into
 * plain PHP arrays (channel, items, image, textinput).
 *
 * Written in PHP 4 style (`var`, `array()`) on purpose so the file keeps
 * loading on every PHP version it historically supported, while avoiding the
 * constructs that break on PHP 7 and PHP 8.
 */

define('RSS', 'RSS');
define('ATOM', 'Atom');

require_once (MAGPIE_DIR . 'rss_utils.inc');

/**
 * Event-driven (expat/SAX style) feed parser.
 *
 * Construct it with the feed source; afterwards read the public fields
 * ($channel, $items, $image, $textinput, $feed_type, $feed_version, $ERROR,
 * $WARNING).
 */
class MagpieRSS {
    var $parser;

    var $current_item   = array();  // item currently being parsed
    var $items          = array();  // collection of parsed items
    var $channel        = array();  // hash of channel fields
    var $textinput      = array();
    var $image          = array();
    var $feed_type;
    var $feed_version;
    var $encoding       = '';       // output encoding of parsed rss

    var $_source_encoding = '';     // only set if we have to parse xml prolog
    // Historically the code wrote to this (undeclared) name instead of
    // $_source_encoding; it is declared and kept in sync for existing readers.
    var $source_encoding  = '';

    var $ERROR = "";
    var $WARNING = "";

    // define some constants

    var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
    var $_KNOWN_ENCODINGS    = array('UTF-8', 'US-ASCII', 'ISO-8859-1');

    // parser variables, useless if you're not a parser, treat as private
    var $stack              = array(); // parser stack
    var $inchannel          = false;
    var $initem             = false;
    var $incontent          = false; // if in Atom <content mode="xml"> field
    var $intextinput        = false;
    var $inimage            = false;
    var $current_namespace  = false;

    /**
     * Parse $source and populate this object.
     *
     * @param string      $source           feed XML as a string
     * @param string      $output_encoding  target encoding for parsed text
     * @param string|null $input_encoding   source encoding, or null to detect
     * @param bool        $detect_encoding  whether to auto-detect the source encoding
     */
    function __construct($source, $output_encoding='ISO-8859-1',
                         $input_encoding=null, $detect_encoding=true)
    {
        # if PHP xml isn't compiled in, there is nothing we can do
        #
        if (!function_exists('xml_parser_create')) {
            $this->error( "Failed to load PHP's XML Extension. " .
                          "http://www.php.net/manual/en/ref.xml.php",
                           E_USER_ERROR );
            // error() only halts when MAGPIE_DEBUG is on; never fall through
            // into calls to functions that do not exist.
            return;
        }

        list($parser, $source) = $this->create_parser($source,
                $output_encoding, $input_encoding, $detect_encoding);

        // xml_parser_create() returns a resource before PHP 8 and an
        // XMLParser object from PHP 8 on; accept both.
        if (!is_resource($parser) && !is_object($parser)) {
            $this->error( "Failed to create an instance of PHP's XML parser. " .
                          "http://www.php.net/manual/en/ref.xml.php",
                          E_USER_ERROR );
            return;
        }

        $this->parser = $parser;

        # pass in parser, and a reference to this object
        # setup handlers
        #
        xml_set_object( $this->parser, $this );
        xml_set_element_handler($this->parser,
                'feed_start_element', 'feed_end_element' );

        xml_set_character_data_handler( $this->parser, 'feed_cdata' );

        $status = xml_parse( $this->parser, $source );

        if (! $status ) {
            $errorcode = xml_get_error_code( $this->parser );
            if ( $errorcode != XML_ERROR_NONE ) {
                $xml_error  = xml_error_string( $errorcode );
                $error_line = xml_get_current_line_number($this->parser);
                $error_col  = xml_get_current_column_number($this->parser);
                $errormsg   = "$xml_error at line $error_line, column $error_col";

                $this->error( $errormsg );
            }
        }

        // Only resource handles need an explicit free; on PHP 8 the call has
        // no effect for parser objects.
        if (is_resource($this->parser)) {
            xml_parser_free( $this->parser );
        }

        $this->normalize();
    }

    /**
     * PHP 4 style constructor, kept so that PHP 4 and any code calling
     * MagpieRSS() explicitly keep working. Forwards to __construct().
     */
    function MagpieRSS ($source, $output_encoding='ISO-8859-1',
                        $input_encoding=null, $detect_encoding=true)
    {
        $this->__construct($source, $output_encoding,
                           $input_encoding, $detect_encoding);
    }

    /**
     * Start-tag handler registered with the XML parser.
     *
     * The first element seen identifies the feed type/version; later elements
     * toggle the in-channel / in-item / in-content state or are pushed on the
     * element stack that feed_cdata() uses to name fields.
     */
    function feed_start_element($p, $element, &$attrs) {
        $el = $element = strtolower($element);
        $attrs = array_change_key_case($attrs, CASE_LOWER);

        // check for a namespace, and split if found
        // (strpos() > 0 on purpose: a leading ':' is not a namespace prefix)
        $ns = false;
        if ( strpos( $element, ':' ) ) {
            list($ns, $el) = explode( ':', $element, 2);
        }
        if ( $ns and $ns != 'rdf' ) {
            $this->current_namespace = $ns;
        }

        # if feed type isn't set, then this is first element of feed
        # identify feed from root element
        #
        if (!isset($this->feed_type) ) {
            if ( $el == 'rdf' ) {
                $this->feed_type = RSS;
                $this->feed_version = '1.0';
            }
            elseif ( $el == 'rss' ) {
                $this->feed_type = RSS;
                $this->feed_version = isset($attrs['version']) ? $attrs['version'] : null;
            }
            elseif ( $el == 'feed' ) {
                $this->feed_type = ATOM;
                // Atom 1.0 <feed> has no version attribute (only 0.3 did).
                // Without a default, is_atom() is falsy and normalize()
                // silently skips the feed.
                $this->feed_version = isset($attrs['version']) ? $attrs['version'] : '1.0';
                $this->inchannel = true;
            }
            return;
        }

        if ( $el == 'channel' )
        {
            $this->inchannel = true;
        }
        elseif ($el == 'item' or $el == 'entry' )
        {
            $this->initem = true;
            if ( isset($attrs['rdf:about']) ) {
                $this->current_item['about'] = $attrs['rdf:about'];
            }
        }

        // if we're in the default namespace of an RSS feed,
        // record textinput or image fields
        elseif (
            $this->feed_type == RSS and
            $this->current_namespace == '' and
            $el == 'textinput' )
        {
            $this->intextinput = true;
        }

        elseif (
            $this->feed_type == RSS and
            $this->current_namespace == '' and
            $el == 'image' )
        {
            $this->inimage = true;
        }

        # handle atom content constructs
        elseif ( $this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            // avoid clashing w/ RSS mod_content
            if ($el == 'content' ) {
                $el = 'atom_content';
            }

            $this->incontent = $el;
        }

        // if inside an Atom content construct (e.g. content or summary) field treat tags as text
        elseif ($this->feed_type == ATOM and $this->incontent )
        {
            // if tags are inlined, then flatten
            $attrs_str = join(' ',
                    array_map('map_attrs',
                    array_keys($attrs),
                    array_values($attrs) ) );

            $this->append_content( "<$element $attrs_str>"  );

            array_unshift( $this->stack, $el );
        }

        // Atom supports many links per containing element.
        // Magpie treats link elements of type rel='alternate'
        // as being equivalent to RSS's simple link element.
        //
        elseif ($this->feed_type == ATOM and $el == 'link' )
        {
            // A link without rel is an "alternate" link (RFC 4287, 4.2.7.2).
            if ( !isset($attrs['rel']) or $attrs['rel'] == 'alternate' )
            {
                $link_el = 'link';
            }
            else {
                $link_el = 'link_' . $attrs['rel'];
            }

            $this->append($link_el, isset($attrs['href']) ? $attrs['href'] : '');
        }
        // set stack[0] to current element
        else {
            array_unshift($this->stack, $el);
        }
    }

    /**
     * Character-data handler: routes text either into the open Atom content
     * construct or into the field named by the current element stack.
     */
    function feed_cdata ($p, $text) {
        if ($this->feed_type == ATOM and $this->incontent)
        {
            $this->append_content( $text );
        }
        else {
            $current_el = join('_', array_reverse($this->stack));
            $this->append($current_el, $text);
        }
    }

    /**
     * End-tag handler: closes items and state flags, pops the element stack,
     * and resets the current namespace.
     */
    function feed_end_element ($p, $el) {
        $el = strtolower($el);

        if ( $el == 'item' or $el == 'entry' )
        {
            $this->items[] = $this->current_item;
            $this->current_item = array();
            $this->initem = false;
        }
        elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'textinput' )
        {
            $this->intextinput = false;
        }
        elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'image' )
        {
            $this->inimage = false;
        }
        elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            $this->incontent = false;
        }
        elseif ($el == 'channel' or $el == 'feed' )
        {
            $this->inchannel = false;
        }
        elseif ($this->feed_type == ATOM and $this->incontent  ) {
            // balance tags properly
            // note: i don't think this is actually necessary
            if ( isset($this->stack[0]) and $this->stack[0] == $el )
            {
                $this->append_content("</$el>");
            }
            else {
                $this->append_content("<$el />");
            }

            array_shift( $this->stack );
        }
        else {
            array_shift( $this->stack );
        }

        $this->current_namespace = false;
    }

    /**
     * Append $str2 to $str1 in place, initialising $str1 to "" when unset.
     * Taking $str1 by reference lets callers pass array slots that do not
     * exist yet.
     */
    function concat (&$str1, $str2="") {
        if (!isset($str1) ) {
            $str1="";
        }
        $str1 .= $str2;
    }

    /**
     * Append text to the currently open Atom content construct, on the
     * current item if one is open, otherwise on the channel.
     */
    function append_content($text) {
        if ( $this->initem ) {
            $this->concat( $this->current_item[ $this->incontent ], $text );
        }
        elseif ( $this->inchannel ) {
            $this->concat( $this->channel[ $this->incontent ], $text );
        }
    }

    // smart append - field and namespace aware
    function append($el, $text) {
        if (!$el) {
            return;
        }
        if ( $this->current_namespace )
        {
            if ( $this->initem ) {
                $this->concat(
                    $this->current_item[ $this->current_namespace ][ $el ], $text);
            }
            elseif ($this->inchannel) {
                $this->concat(
                    $this->channel[ $this->current_namespace][ $el ], $text );
            }
            elseif ($this->intextinput) {
                $this->concat(
                    $this->textinput[ $this->current_namespace][ $el ], $text );
            }
            elseif ($this->inimage) {
                $this->concat(
                    $this->image[ $this->current_namespace ][ $el ], $text );
            }
        }
        else {
            if ( $this->initem ) {
                $this->concat(
                    $this->current_item[ $el ], $text);
            }
            elseif ($this->intextinput) {
                $this->concat(
                    $this->textinput[ $el ], $text );
            }
            elseif ($this->inimage) {
                $this->concat(
                    $this->image[ $el ], $text );
            }
            elseif ($this->inchannel) {
                $this->concat(
                    $this->channel[ $el ], $text );
            }
        }
    }

    /**
     * Cross-populate RSS and Atom field names so callers can read either
     * vocabulary (description/summary, content:encoded/atom_content,
     * tagline/description) and add a 'date_timestamp' to items whose date
     * could be parsed.
     */
    function normalize () {
        // if atom populate rss fields
        if ( $this->is_atom() ) {
            // key is always written (null when absent), as before
            $this->channel['description'] =
                isset($this->channel['tagline']) ? $this->channel['tagline'] : null;

            $item_count = count($this->items);
            for ( $i = 0; $i < $item_count; $i++) {
                $item = $this->items[$i];
                if ( isset($item['summary']) )
                    $item['description'] = $item['summary'];
                if ( isset($item['atom_content']))
                    $item['content']['encoded'] = $item['atom_content'];

                $atom_date = null;
                if ( isset($item['issued']) ) {
                    $atom_date = $item['issued'];
                }
                elseif ( isset($item['modified']) ) {
                    $atom_date = $item['modified'];
                }

                if ( $atom_date ) {
                    $epoch = @parse_w3cdtf($atom_date);
                    if ($epoch and $epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }

                $this->items[$i] = $item;
            }
        }
        elseif ( $this->is_rss() ) {
            $this->channel['tagline'] =
                isset($this->channel['description']) ? $this->channel['description'] : null;

            $item_count = count($this->items);
            for ( $i = 0; $i < $item_count; $i++) {
                $item = $this->items[$i];
                if ( isset($item['description']))
                    $item['summary'] = $item['description'];
                if ( isset($item['content']['encoded'] ) )
                    $item['atom_content'] = $item['content']['encoded'];

                if ( $this->is_rss() == '1.0' and isset($item['dc']['date']) ) {
                    $epoch = @parse_w3cdtf($item['dc']['date']);
                    if ($epoch and $epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }
                elseif ( isset($item['pubdate']) ) {
                    $epoch = @strtotime($item['pubdate']);
                    // "> 0" rejects both false and the -1 older PHP returned
                    if ($epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }

                $this->items[$i] = $item;
            }
        }
    }

    /**
     * @return mixed the feed version when the feed is RSS, false otherwise
     */
    function is_rss () {
        if ( $this->feed_type == RSS ) {
            return $this->feed_version;
        }
        else {
            return false;
        }
    }

    /**
     * @return mixed the feed version when the feed is Atom, false otherwise
     */
    function is_atom() {
        if ( $this->feed_type == ATOM ) {
            return $this->feed_version;
        }
        else {
            return false;
        }
    }

    /**
     * Create the XML parser and set its target (output) encoding.
     *
     * @return array array($parser, $source); $source may have been re-encoded
     *               by the PHP 4 code path
     */
    function create_parser($source, $out_enc, $in_enc, $detect) {
        // Was a first-character comparison against "5", which sent PHP 7 and
        // PHP 8 down the PHP 4 code path.
        if ( version_compare(PHP_VERSION, '5.0.0', '>=') ) {
            $parser = $this->php5_create_parser($in_enc, $detect);
        }
        else {
            list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
        }
        if ($out_enc) {
            $this->encoding = $out_enc;
            // do not pass a failed parser on; the constructor reports it
            if ($parser) {
                xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
            }
        }

        return array($parser, $source);
    }

    /**
     * Parser creation for PHP 5 and later: an empty encoding string asks the
     * XML extension to detect the input encoding itself.
     */
    function php5_create_parser($in_enc, $detect) {
        // by default php5 does a fine job of detecting input encodings
        if(!$detect && $in_enc) {
            return xml_parser_create($in_enc);
        }
        else {
            return xml_parser_create('');
        }
    }

    /**
     * Parser creation for PHP 4: sniff the encoding from the XML prolog and,
     * when it is not one the parser knows, try to convert the source to UTF-8
     * with iconv or mbstring.
     *
     * @return array array($parser, $source)
     */
    function php4_create_parser($source, $in_enc, $detect) {
        if ( !$detect ) {
            return array(xml_parser_create($in_enc), $source);
        }

        if (!$in_enc) {
            // "<\?xml": the unescaped "<?" made the "<" optional instead of
            // matching the literal prolog opener.
            if (preg_match('/<\?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $source, $m)) {
                $in_enc = strtoupper($m[1]);
                $this->_source_encoding = $in_enc;
                $this->source_encoding  = $in_enc;
            }
            else {
                $in_enc = 'UTF-8';
            }
        }

        if ($this->known_encoding($in_enc)) {
            return array(xml_parser_create($in_enc), $source);
        }

        // the detected encoding is not one of the simple encodings PHP knows

        // attempt to use the iconv extension to
        // cast the XML to a known encoding
        // @see http://php.net/iconv

        if (function_exists('iconv'))  {
            $encoded_source = iconv($in_enc,'UTF-8', $source);
            if ($encoded_source) {
                return array(xml_parser_create('UTF-8'), $encoded_source);
            }
        }

        // iconv didn't work, try mb_convert_encoding
        // @see http://php.net/mbstring
        if(function_exists('mb_convert_encoding')) {
            $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc );
            if ($encoded_source) {
                return array(xml_parser_create('UTF-8'), $encoded_source);
            }
        }

        // else
        $this->error("Feed is in an unsupported character encoding. ($in_enc) " .
                     "You may see strange artifacts, and mangled characters.",
                     E_USER_NOTICE);

        return array(xml_parser_create(), $source);
    }

    /**
     * @return mixed the upper-cased encoding name if it is listed in
     *               $_KNOWN_ENCODINGS, false otherwise
     */
    function known_encoding($enc) {
        $enc = strtoupper($enc);
        if ( in_array($enc, $this->_KNOWN_ENCODINGS) ) {
            return $enc;
        }
        else {
            return false;
        }
    }

    /**
     * Report a problem and remember it on the object: notice-level messages
     * go to $WARNING, everything else to $ERROR.
     */
    function error ($errormsg, $lvl=E_USER_WARNING) {
        // append PHP's error message if track_errors enabled
        // ($php_errormsg is never populated on PHP 8, so this is a no-op there)
        if ( isset($php_errormsg) ) {
            $errormsg .= " ($php_errormsg)";
        }
        // an undefined MAGPIE_DEBUG means "off"; reading it unguarded is a
        // fatal Error on PHP 8
        if ( defined('MAGPIE_DEBUG') && MAGPIE_DEBUG ) {
            trigger_error( $errormsg, $lvl);
        }
        else {
            error_log( $errormsg, 0);
        }

        $notices = E_USER_NOTICE|E_NOTICE;
        if ( $lvl&$notices ) {
            $this->WARNING = $errormsg;
        } else {
            $this->ERROR = $errormsg;
        }
    }

} // end class RSS

/**
 * Format one attribute as name="value" when inlined Atom markup is flattened
 * back to text.
 * NOTE(review): the value is not escaped, so a value containing a double
 * quote yields broken markup; left as-is because consumers may depend on the
 * raw output.
 */
function map_attrs($k, $v) {
    return "$k=\"$v\"";
}

// patch to support medieval versions of PHP4.1.x,
// courtesy, Ryan Currie, ryan@digibliss.com

if (!function_exists('array_change_key_case')) {
    if (!defined('CASE_UPPER')) {
        define("CASE_UPPER",1);
    }
    if (!defined('CASE_LOWER')) {
        define("CASE_LOWER",0);
    }

    /**
     * Fallback for array_change_key_case(): returns a copy of $array with
     * every key lower-cased (default) or upper-cased.
     */
    function array_change_key_case($array,$case=CASE_LOWER) {
        // The original used "=" (assignment) in both conditions, so it
        // always ended up upper-casing.
        $cmd = ($case == CASE_UPPER) ? 'strtoupper' : 'strtolower';
        $output = array();
        foreach($array as $key=>$value) {
            $output[$cmd($key)]=$value;
        }
        return $output;
    }

}

?>
For more help developing with SiT! see http://sitracker.org/wiki/DevelopmentHowTo