00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 @require_once('config.inc.php');
00013 require_once('html.inc.php');
00014 require_once('util.inc.php');
00015
00016
00017
00018
00019
00020
00021
00022
/**
 * Encode a string for HTML output while leaving existing markup alone.
 *
 * The string is processed in two passes:
 *  1. every ampersand that does not start something shaped like a character
 *     reference (&name; &#123; &#x1f;) is replaced by &amp;
 *  2. every "<" that does not start something shaped like a tag is replaced
 *     by &lt;, and every ">" outside of such a tag is replaced by &gt;
 *
 * Only the shape is checked; entity and tag names are not looked up in any
 * list. Each decision is reported through log_msg() at level 'debug'.
 *
 * @param string $html input string
 * @return string encoded string
 */
function html_encode_str_smart($html)
{
    log_msg('debug', 'html_encode_str_smart: string is '.quot(var_dump_inl($html)));

    // pass 1: ampersands
    $cur = 0;
    while ($cur < strlen($html)) {
        if (substr($html, $cur, 1) != '&') {
            $cur++;
            continue;
        }

        $reason = _html_encode_amp_reason($html, $cur);
        if ($reason === false) {
            log_msg('debug', 'html_encode_str_smart: not replacing ampersand at '.$cur);
            $cur++;
        } else {
            log_msg('debug', 'html_encode_str_smart: replacing ampersand at '.$cur.' because of '.$reason);
            $html = substr($html, 0, $cur).'&amp;'.substr($html, $cur+1);
            log_msg('debug', 'html_encode_str_smart: new string is '.quot(var_dump_inl($html)));
            // continue behind the five characters that were just inserted
            $cur += 5;
        }
    }

    // pass 2: angle brackets
    $cur = 0;
    while ($cur < strlen($html)) {
        $char = substr($html, $cur, 1);

        if ($char == '<') {
            list($end, $reason) = _html_encode_tag_scan($html, $cur);
            if ($end !== false) {
                // tag-shaped: jump behind its closing bracket so that the
                // bracket itself is not encoded either
                $cur = $end+1;
                continue;
            }
            log_msg('debug', 'html_encode_str_smart: replacing opening bracket at '.$cur.' because of '.$reason);
            $html = substr($html, 0, $cur).'&lt;'.substr($html, $cur+1);
            log_msg('debug', 'html_encode_str_smart: new string is '.quot(var_dump_inl($html)));
            // continue behind the four characters that were just inserted
            $cur += 4;
        } elseif ($char == '>') {
            // a closing bracket that was not consumed as part of a tag
            $reason = 5;
            log_msg('debug', 'html_encode_str_smart: replacing closing bracket at '.$cur.' because of '.$reason);
            $html = substr($html, 0, $cur).'&gt;'.substr($html, $cur+1);
            log_msg('debug', 'html_encode_str_smart: new string is '.quot(var_dump_inl($html)));
            $cur += 4;
        } else {
            $cur++;
        }
    }

    return $html;
}

/**
 * Decide whether the ampersand at $cur starts a character reference.
 *
 * Reason codes (they end up in the debug log):
 *  1 = "&#x;" without any hex digit        2 = invalid character in a hex reference
 *  3 = hex reference runs into end of string
 *  4 = invalid character in a decimal reference
 *  5 = decimal reference runs into end of string
 *  6 = "&#" followed by neither "x" nor a digit
 *  7 = "&;" without a name                  8 = invalid character in a name
 *  9 = name runs into end of string
 *
 * @param string $html string being encoded
 * @param int $cur offset of the ampersand
 * @return bool|int false if a well-formed reference starts here, otherwise
 *  the reason code explaining why the ampersand must be encoded
 */
function _html_encode_amp_reason($html, $cur)
{
    $len = strlen($html);

    // numeric references need at least one character behind "&#" and its
    // type marker, otherwise the text is judged like a named reference
    if ($cur+3 < $len && substr($html, $cur+1, 1) == '#') {
        $kind = substr($html, $cur+2, 1);

        if ($kind === 'x' || $kind === 'X') {
            $digits = strspn($html, '0123456789abcdefABCDEF', $cur+3);
            $ahead = $cur+3+$digits;
            if ($len <= $ahead) {
                return 3;
            }
            if (substr($html, $ahead, 1) !== ';') {
                return 2;
            }
            return (0 < $digits) ? false : 1;
        }

        if (strspn($kind, '0123456789') == 1) {
            // the first digit sits at $cur+2, so any further digits are optional
            $ahead = $cur+3+strspn($html, '0123456789', $cur+3);
            if ($len <= $ahead) {
                return 5;
            }
            if (substr($html, $ahead, 1) !== ';') {
                return 4;
            }
            return false;
        }

        return 6;
    }

    // named reference: letters and digits followed by a semicolon
    $chars = strspn($html, '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $cur+1);
    $ahead = $cur+1+$chars;
    if ($len <= $ahead) {
        return 9;
    }
    if (substr($html, $ahead, 1) !== ';') {
        return 8;
    }
    return (0 < $chars) ? false : 7;
}

/**
 * Decide whether the "<" at $cur starts something shaped like a tag.
 *
 * Reason codes (they end up in the debug log):
 *  1 = another "<" shows up before the closing ">"
 *  2 = "<>" without any content
 *  3 = first character is neither a letter, a digit nor "/"
 *  4 = end of string reached before a closing ">"
 *
 * @param string $html string being encoded
 * @param int $cur offset of the opening bracket
 * @return array array(offset of the closing bracket, false) for a tag,
 *  array(false, reason code) if the bracket must be encoded
 */
function _html_encode_tag_scan($html, $cur)
{
    $len = strlen($html);
    if ($len <= $cur+1) {
        return array(false, 4);
    }

    $first = substr($html, $cur+1, 1);
    if ($first === '<') {
        return array(false, 1);
    }
    if ($first === '>') {
        return array(false, 2);
    }
    if ($first !== '/' && strspn($first, '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ') != 1) {
        return array(false, 3);
    }

    // whichever bracket comes next decides
    $ahead = $cur+1+strcspn($html, '<>', $cur+1);
    if ($len <= $ahead) {
        return array(false, 4);
    }
    if (substr($html, $ahead, 1) === '<') {
        return array(false, 1);
    }
    return array($ahead, false);
}
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
/**
 * Split a string of HTML into its top-level elements and the text between them.
 *
 * The string is scanned for "<...>" sequences while keeping track of the
 * nesting depth. Every element that closes at depth zero is handed to
 * html_parse_elem(); text in front of, between and behind those elements is
 * trimmed and returned as plain strings (empty runs are dropped).
 *
 * A tag counts as single (not a container) if its name is listed in the
 * global $single_tags. The name is the first piece returned by
 * expl_whitesp() for the tag's content - NOTE(review): expl_whitesp() is
 * defined elsewhere; this assumes it splits on whitespace, the same way
 * html_parse_elem() relies on it for the tag name.
 *
 * A closing tag that has no open element to close is ignored as markup (it
 * ends up in the surrounding text).
 *
 * @param string $html string to parse
 * @param bool $recursive passed through to html_parse_elem()
 * @return array list of strings (text) and arrays (elements as returned by
 *  html_parse_elem()) in document order
 */
function html_parse($html, $recursive = false)
{
    global $single_tags;

    $ret = array();
    $len = strlen($html);

    $open_tag = false;      // name of the top-level element currently open
    $open_pos = false;      // offset of its opening bracket
    $close_pos = false;     // offset of the ">" ending the last element emitted
    $num_open = 0;          // nesting depth

    for ($pos = 0; $pos < $len; $pos++) {
        if ($html[$pos] != '<') {
            continue;
        }
        $next = strpos($html, '>', $pos+1);
        if ($next === false) {
            // no closing bracket behind this point, so no further tags either
            break;
        }

        $tag = strtolower(trim(substr($html, $pos+1, $next-$pos-1)));
        $complete = false;

        if (substr($tag, 0, 1) !== '/') {
            // opening or single tag
            $a = expl_whitesp($tag, true);
            $name = isset($a[0]) ? $a[0] : '';
            if ($num_open == 0) {
                $open_tag = $name;
                $open_pos = $pos;
            }
            // look up the name only, the tag may carry attributes
            if (!in_array($name, $single_tags)) {
                $num_open++;
            } elseif ($num_open == 0) {
                $complete = true;
            }
        } elseif (0 < $num_open) {
            // closing tag (one without an open element is skipped, otherwise
            // the depth would go negative and never recover)
            $num_open--;
            if ($num_open == 0 && $open_tag === (string)substr($tag, 1)) {
                $complete = true;
            }
        }

        if (!$complete) {
            continue;
        }

        // text between the previous element (or the start) and this one
        $start = ($close_pos === false) ? 0 : $close_pos+1;
        if ($start < $open_pos) {
            $text = trim(substr($html, $start, $open_pos-$start));
            if (0 < strlen($text)) {
                $ret[] = $text;
            }
        }

        $close_pos = $next;
        $ret[] = html_parse_elem(substr($html, $open_pos, $close_pos-$open_pos+1), $recursive);
    }

    // text behind the last element (or the whole string if there was none)
    $start = ($close_pos === false) ? 0 : $close_pos+1;
    if ($start < $len) {
        $text = trim(substr($html, $start));
        if (0 < strlen($text)) {
            $ret[] = $text;
        }
    }

    return $ret;
}
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
/**
 * Parse a single HTML element into an array.
 *
 * The result holds the lowercased element name under 'tag' and one entry per
 * attribute (lowercased name => value). Attributes without a value are
 * stored with their own name as value. The value of "class" is split at
 * spaces and the value of "style" is turned into an array of
 * property => value pairs. For elements that are not listed in the global
 * $single_tags and that end in a matching closing tag, the content between
 * opening and closing tag is stored under 'val'.
 *
 * 'tag' always holds the element name, even if the element carries an
 * attribute called "tag"; 'val' holds the content whenever content is found.
 *
 * NOTE(review): expl_whitesp() and expl() are defined elsewhere. This
 * assumes expl_whitesp() splits at whitespace (keeping quoted parts
 * together) and expl() behaves like explode() - confirm.
 *
 * @param string $html element, starting with its opening bracket
 * @param bool $recursive also run html_parse() on the content
 * @return array parsed element, empty array if $html has no opening tag
 */
function html_parse_elem($html, $recursive = false)
{
    global $single_tags;
    $quot = array('"', "'");

    $ret = array();

    // end of the opening tag - without one there is nothing to parse
    $next = (1 < strlen($html)) ? strpos($html, '>', 1) : false;
    if ($next === false) {
        return $ret;
    }

    $a = expl_whitesp(substr($html, 1, $next-1), true);
    if (count($a) < 1) {
        return $ret;
    }
    $tag = strtolower($a[0]);
    $ret['tag'] = $tag;

    // glue attributes back together that were written with whitespace
    // around the equal sign (index 0 is the tag name and never merged)
    for ($i = 1; $i < count($a); $i++) {
        if ($a[$i] == '=' && 1 < $i && $i+1 < count($a)) {
            // name, "=" and value are three separate pieces
            $a[$i] = $a[$i-1].'='.$a[$i+1];
            array_splice($a, $i-1, 1);
            array_splice($a, $i, 1);
            $i--;
        } elseif (substr($a[$i], -1) == '=' && $i+1 < count($a)) {
            // "name=" followed by the value
            $a[$i] .= $a[$i+1];
            array_splice($a, $i+1, 1);
            $i--;
        } elseif (substr($a[$i], 0, 1) == '=' && 1 < $i) {
            // name followed by "=value"
            $a[$i] = $a[$i-1].$a[$i];
            array_splice($a, $i-1, 1);
            $i--;
        }
    }

    // attributes
    for ($i = 1; $i < count($a); $i++) {
        $equal = strpos($a[$i], '=');
        if ($equal === false) {
            $attr = strtolower(htmlspecialchars_decode($a[$i], ENT_QUOTES));
            $ret[$attr] = $attr;
            continue;
        }

        $attr = strtolower(htmlspecialchars_decode(substr($a[$i], 0, $equal), ENT_QUOTES));
        $val = (string)substr($a[$i], $equal+1);

        // remove the surrounding quotes before decoding, so that an encoded
        // quote inside the value is never mistaken for a delimiter
        $first = substr($val, 0, 1);
        if (in_array($first, $quot, true) && $first === substr($val, -1)) {
            $val = (string)substr($val, 1, -1);
        }
        $val = htmlspecialchars_decode($val, ENT_QUOTES);

        if ($attr == 'class') {
            $val = expl(' ', $val);
        } elseif ($attr == 'style') {
            $styles = expl(';', $val);
            $val = array();
            foreach ($styles as $style) {
                $temp = expl(':', $style);
                if (1 < count($temp)) {
                    // the value itself may contain colons (e.g. urls)
                    $val[strtolower(trim($temp[0]))] = trim(implode(':', array_slice($temp, 1)));
                }
            }
        }
        $ret[$attr] = $val;
    }

    // an attribute called "tag" must not replace the element name
    $ret['tag'] = $tag;

    // content
    if (!in_array($tag, $single_tags)) {
        $last = strrpos($html, '<');
        // the last tag must come behind the opening tag, be a closing tag
        // and end the string
        if ($last !== false && $next < $last
            && substr($html, $last+1, 1) === '/' && substr($html, -1) === '>') {
            $closing = strtolower(trim((string)substr($html, $last+2, -1)));
            if ($closing === $tag) {
                $ret['val'] = trim(substr($html, $next+1, $last-$next-1));
                if ($recursive) {
                    $ret['val'] = html_parse($ret['val'], true);
                }
            }
        }
    }

    return $ret;
}