simple_html_dom.php 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353
  1. <?php
  2. /**
  3. * Website: http://sourceforge.net/projects/simplehtmldom/
  4. * Additional projects: http://sourceforge.net/projects/debugobject/
  5. * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
  6. *
  7. * Licensed under The MIT License
  8. * See the LICENSE file in the project root for more information.
  9. *
  10. * Authors:
  11. * S.C. Chen
  12. * John Schlick
  13. * Rus Carroll
  14. * logmanoriginal
  15. *
  16. * Contributors:
  17. * Yousuke Kumakura
  18. * Vadim Voituk
  19. * Antcs
  20. *
  21. * Version Rev. 1.9.1 (291)
  22. */
  // Node types (value of simple_html_dom_node::$nodetype).
  define('HDOM_TYPE_ELEMENT', 1);
  define('HDOM_TYPE_COMMENT', 2);
  define('HDOM_TYPE_TEXT', 3);
  define('HDOM_TYPE_ENDTAG', 4);
  define('HDOM_TYPE_ROOT', 5);
  define('HDOM_TYPE_UNKNOWN', 6);

  // Attribute quoting styles, looked up per attribute when a tag is rebuilt.
  define('HDOM_QUOTE_DOUBLE', 0);
  define('HDOM_QUOTE_SINGLE', 1);
  define('HDOM_QUOTE_NO', 3);

  // Keys of the per-node bookkeeping array simple_html_dom_node::$_.
  define('HDOM_INFO_BEGIN', 0);
  define('HDOM_INFO_END', 1);
  define('HDOM_INFO_QUOTE', 2);
  define('HDOM_INFO_SPACE', 3);
  define('HDOM_INFO_TEXT', 4);
  define('HDOM_INFO_INNER', 5);
  define('HDOM_INFO_OUTER', 6);
  define('HDOM_INFO_ENDSPACE', 7);

  // Defaults that the including script may override by defining the
  // constant before this file is loaded.
  if (!defined('DEFAULT_TARGET_CHARSET')) {
      define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  }
  if (!defined('DEFAULT_BR_TEXT')) {
      define('DEFAULT_BR_TEXT', "\r\n");
  }
  if (!defined('DEFAULT_SPAN_TEXT')) {
      define('DEFAULT_SPAN_TEXT', ' ');
  }
  if (!defined('MAX_FILE_SIZE')) {
      define('MAX_FILE_SIZE', 600000);
  }

  // NOTE(review): not referenced in this part of the file; meaning to be
  // confirmed against the parser.
  define('HDOM_SMARTY_AS_TEXT', 1);
  /**
   * Read a document with file_get_contents() and parse it.
   *
   * @param string   $url              Path or URL handed to file_get_contents().
   * @param bool     $use_include_path Forwarded to file_get_contents().
   * @param resource $context          Stream context, or null.
   * @param int      $offset           Read offset forwarded to file_get_contents().
   * @param int      $maxLen           Maximum number of bytes to read; values
   *                                   <= 0 fall back to MAX_FILE_SIZE.
   * @param bool     $lowercase        Forwarded to the DOM constructor and load().
   * @param bool     $forceTagsClosed  Forwarded to the DOM constructor.
   * @param string   $target_charset   Forwarded to the DOM constructor.
   * @param bool     $stripRN          Forwarded to the DOM constructor and load().
   * @param string   $defaultBRText    Forwarded to the DOM constructor.
   * @param string   $defaultSpanText  Forwarded to the DOM constructor.
   *
   * @return mixed Whatever simple_html_dom::load() returns, or false when the
   *               read failed, returned nothing, or exceeded $maxLen.
   */
  function file_get_html(
      $url,
      $use_include_path = false,
      $context = null,
      $offset = 0,
      $maxLen = -1,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      if ($maxLen <= 0) {
          $maxLen = MAX_FILE_SIZE;
      }

      $dom = new simple_html_dom(
          null,
          $lowercase,
          $forceTagsClosed,
          $target_charset,
          $stripRN,
          $defaultBRText,
          $defaultSpanText
      );

      // An alternative fetcher, retrieve_url_contents($url), was used here
      // in some distributions; file_get_contents() is the active one.
      $contents = file_get_contents(
          $url,
          $use_include_path,
          $context,
          $offset,
          $maxLen
      );

      // Compare explicitly instead of using empty(): a document whose whole
      // content is the string "0" is valid input and must not be rejected.
      if ($contents === false
          || $contents === ''
          || strlen($contents) > $maxLen) {
          $dom->clear();
          return false;
      }

      return $dom->load($contents, $lowercase, $stripRN);
  }
  /**
   * Parse an HTML string.
   *
   * @param string $str             Markup to parse.
   * @param bool   $lowercase       Forwarded to the DOM constructor and load().
   * @param bool   $forceTagsClosed Forwarded to the DOM constructor.
   * @param string $target_charset  Forwarded to the DOM constructor.
   * @param bool   $stripRN         Forwarded to the DOM constructor and load().
   * @param string $defaultBRText   Forwarded to the DOM constructor.
   * @param string $defaultSpanText Forwarded to the DOM constructor.
   *
   * @return mixed Whatever simple_html_dom::load() returns, or false when
   *               $str is empty or longer than MAX_FILE_SIZE bytes.
   */
  function str_get_html(
      $str,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      $dom = new simple_html_dom(
          null,
          $lowercase,
          $forceTagsClosed,
          $target_charset,
          $stripRN,
          $defaultBRText,
          $defaultSpanText
      );

      // empty() also treats the string "0" as empty, but "0" is a perfectly
      // valid (one text node) document, so it is let through explicitly.
      $is_blank = empty($str) && $str !== '0';

      if ($is_blank || strlen($str) > MAX_FILE_SIZE) {
          $dom->clear();
          return false;
      }

      return $dom->load($str, $lowercase, $stripRN);
  }
  /**
   * Print the tree below $node by delegating to the node's dump() method.
   *
   * @param object $node      Node exposing dump($show_attr, $depth).
   * @param bool   $show_attr Whether attributes are printed next to each tag.
   * @param int    $deep      Indentation depth of the first printed line.
   *
   * @return void
   */
  function dump_html_tree($node, $show_attr = true, $deep = 0)
  {
      // Forward the caller's options; previously the node itself was passed
      // as $show_attr and $deep was dropped.
      $node->dump($show_attr, $deep);
  }
  114. class simple_html_dom_node
  115. {
  116. public $nodetype = HDOM_TYPE_TEXT;
  117. public $tag = 'text';
  118. public $attr = array();
  119. public $children = array();
  120. public $nodes = array();
  121. public $parent = null;
  122. public $_ = array();
  123. public $tag_start = 0;
  124. private $dom = null;
  /**
   * Attach the node to its owning document.
   *
   * The node is appended to the document's public `nodes` list and keeps a
   * back-reference to the document.
   *
   * @param object $dom Owning document object.
   */
  function __construct($dom)
  {
      $dom->nodes[] = $this;
      $this->dom = $dom;
  }
  /**
   * Drop all references held by the node when it is destroyed.
   */
  function __destruct()
  {
      $this->clear();
  }
  /**
   * String conversion yields the node's outer markup.
   *
   * @return string Result of outertext().
   */
  function __toString()
  {
      return $this->outertext();
  }
  /**
   * Release the references to the document, the parent and all descendants.
   *
   * After this call $dom, $parent, $nodes and $children are null (not empty
   * arrays), so code touching a cleared node must tolerate null.
   *
   * @return void
   */
  function clear()
  {
      $this->children = null;
      $this->parent = null;
      $this->nodes = null;
      $this->dom = null;
  }
  /**
   * Echo this node and, recursively, every entry of $nodes as an indented
   * tree (one tab per level, one node per line).
   *
   * @param bool $show_attr Append the attribute list to each tag name.
   * @param int  $depth     Indentation level of this node.
   *
   * @return void
   */
  function dump($show_attr = true, $depth = 0)
  {
      $line = str_repeat("\t", $depth) . $this->tag;

      if ($show_attr && count($this->attr) > 0) {
          $line .= '(';
          foreach ($this->attr as $name => $value) {
              $line .= "[$name]=>\"$value\", ";
          }
          $line .= ')';
      }

      echo $line . "\n";

      // $nodes may be null or empty; nothing further to print in that case.
      if (!$this->nodes) {
          return;
      }

      foreach ($this->nodes as $child) {
          $child->dump($show_attr, $depth + 1);
      }
  }
  /**
   * Build a one-line debug description of this node: tag, attributes, the
   * internal $_ bookkeeping array, text, inner-text override, child and node
   * counts and tag_start.
   *
   * @param bool $echo When true the line is echoed and nothing is returned;
   *                   when false the line is returned instead.
   *
   * @return string|void
   */
  function dump_node($echo = true)
  {
      $string = $this->tag;

      if (count($this->attr) > 0) {
          $string .= '(';
          foreach ($this->attr as $k => $v) {
              $string .= "[$k]=>\"$v\", ";
          }
          $string .= ')';
      }

      if (count($this->_) > 0) {
          $string .= ' $_ (';
          foreach ($this->_ as $k => $v) {
              if (is_array($v)) {
                  // One level of nesting (e.g. per-attribute info) is expanded.
                  $string .= "[$k]=>(";
                  foreach ($v as $k2 => $v2) {
                      $string .= "[$k2]=>\"$v2\", ";
                  }
                  $string .= ')';
              } else {
                  $string .= "[$k]=>\"$v\", ";
              }
          }
          $string .= ')';
      }

      if (isset($this->text)) {
          $string .= " text: ({$this->text})";
      }

      // Report this node's own inner-text override. The previous code read
      // an undefined $node variable here and therefore always printed NULL.
      $string .= ' HDOM_INNER_INFO: ';
      if (isset($this->_[HDOM_INFO_INNER])) {
          $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
      } else {
          $string .= ' NULL ';
      }

      $string .= ' children: ' . count($this->children);
      $string .= ' nodes: ' . count($this->nodes);
      $string .= ' tag_start: ' . $this->tag_start;
      $string .= "\n";

      if ($echo) {
          echo $string;
          return;
      }

      return $string;
  }
  /**
   * Get the parent node or, when $parent is given, set it.
   *
   * NOTE: re-parenting is incomplete. The node is appended to the new
   * parent's $nodes and $children lists but is not removed from the lists of
   * its previous parent.
   *
   * @param object|null $parent New parent node, or null to only read.
   *
   * @return object|null The (possibly new) parent.
   */
  function parent($parent = null)
  {
      if ($parent === null) {
          return $this->parent;
      }

      $this->parent = $parent;
      $parent->nodes[] = $this;
      $parent->children[] = $this;

      return $this->parent;
  }
  /**
   * Tell whether the node has at least one child element.
   *
   * @return bool False when $children is empty or null.
   */
  function has_child()
  {
      return (bool) $this->children;
  }
  /**
   * Return all children, or the child at a given position.
   *
   * @param int $idx Position of the wanted child; -1 (the default) returns
   *                 the whole $children list.
   *
   * @return array|object|null The list, the child, or null when no child
   *                           exists at $idx.
   */
  function children($idx = -1)
  {
      if ($idx === -1) {
          return $this->children;
      }

      return isset($this->children[$idx]) ? $this->children[$idx] : null;
  }
  /**
   * Return the first child element.
   *
   * Safe to call on a cleared node: clear() sets $children to null, and
   * count(null) raises a warning on PHP 7.2+ and a TypeError on PHP 8, so
   * the presence of the element is tested with isset() instead.
   *
   * @return object|null First child, or null when there is none.
   */
  function first_child()
  {
      return isset($this->children[0]) ? $this->children[0] : null;
  }
  /**
   * Return the last child element.
   *
   * Safe to call on a cleared node: clear() sets $children to null, and
   * count(null) raises a warning on PHP 7.2+ and a TypeError on PHP 8, so
   * emptiness is tested with empty() instead.
   *
   * @return object|null Last child, or null when there is none.
   */
  function last_child()
  {
      if (empty($this->children)) {
          return null;
      }

      return end($this->children);
  }
  /**
   * Return the element that follows this node in its parent's $children.
   *
   * @return object|null Null when the node has no parent, is not listed
   *                     among the parent's children, or is the last child.
   */
  function next_sibling()
  {
      if ($this->parent === null) {
          return null;
      }

      $position = array_search($this, $this->parent->children, true);
      if ($position === false) {
          return null;
      }

      return isset($this->parent->children[$position + 1])
          ? $this->parent->children[$position + 1]
          : null;
  }
  /**
   * Return the element that precedes this node in its parent's $children.
   *
   * @return object|null Null when the node has no parent, is not listed
   *                     among the parent's children, or is the first child.
   */
  function prev_sibling()
  {
      if ($this->parent === null) {
          return null;
      }

      $position = array_search($this, $this->parent->children, true);
      if ($position === false || $position <= 0) {
          return null;
      }

      return $this->parent->children[$position - 1];
  }
  /**
   * Walk up the parent chain and return the nearest ancestor whose tag is
   * exactly $tag.
   *
   * @param string $tag Tag name to look for (compared with ===).
   *
   * @return object|null The matching ancestor, or null when none exists.
   */
  function find_ancestor_tag($tag)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      for ($ancestor = $this->parent; $ancestor !== null; $ancestor = $ancestor->parent) {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
          }

          if ($ancestor->tag === $tag) {
              return $ancestor;
          }
      }

      return null;
  }
  /**
   * Return the markup contained inside this node.
   *
   * Precedence: an explicit inner-text override, then the node's own stored
   * text (passed through the document's restore_noise()), then the
   * concatenated outertext() of every entry in $nodes.
   *
   * @return string
   */
  function innertext()
  {
      if (isset($this->_[HDOM_INFO_INNER])) {
          return $this->_[HDOM_INFO_INNER];
      }

      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      $html = '';
      foreach ($this->nodes as $child) {
          $html .= $child->outertext();
      }

      return $html;
  }
  /**
   * Return the markup of this node including its own tags.
   *
   * Order of evaluation: the root node returns its innertext(); then the
   * document callback (if any) is invoked with this node; then an outer-text
   * override or stored text wins; otherwise the opening tag is rebuilt with
   * makeup(), followed by the inner content and, if recorded, a closing tag.
   *
   * @return string
   */
  function outertext()
  {
      global $debug_object;
      if (is_object($debug_object)) {
          $suffix = ($this->tag === 'text' && !empty($this->text))
              ? ' with text: ' . $this->text
              : '';
          $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
      }

      if ($this->tag === 'root') {
          return $this->innertext();
      }

      // todo: What is the use of this callback? Remove?
      // It runs before the overrides below are read, so it may alter them.
      if ($this->dom && $this->dom->callback !== null) {
          call_user_func($this->dom->callback, $this);
      }

      if (isset($this->_[HDOM_INFO_OUTER])) {
          return $this->_[HDOM_INFO_OUTER];
      }

      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      // Opening tag, rebuilt from the node registered at this node's begin
      // index in the document.
      $html = '';
      if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
          $html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
      }

      if (isset($this->_[HDOM_INFO_INNER])) {
          // todo: <br> should either never have HDOM_INFO_INNER or always
          if ($this->tag !== 'br') {
              $html .= $this->_[HDOM_INFO_INNER];
          }
      } elseif ($this->nodes) {
          foreach ($this->nodes as $child) {
              $html .= $this->convert_text($child->outertext());
          }
      }

      // A closing tag is emitted only when an end index other than 0 exists.
      if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
          $html .= '</' . $this->tag . '>';
      }

      return $html;
  }
  /**
   * Return the plain-text content of this node.
   *
   * An inner-text override is returned as is. Text nodes return their
   * stored text; comments, unknown nodes, <script> and <style> yield ''.
   * For any other element the text of every entry in $nodes is
   * concatenated: a <p> starts after a blank line and each <span> is
   * followed by the document's default span text.
   *
   * @return string
   */
  function text()
  {
      if (isset($this->_[HDOM_INFO_INNER])) {
          return $this->_[HDOM_INFO_INNER];
      }

      switch ($this->nodetype) {
          case HDOM_TYPE_TEXT:
              return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          case HDOM_TYPE_COMMENT:
          case HDOM_TYPE_UNKNOWN:
              return '';
      }

      if (strcasecmp($this->tag, 'script') === 0
          || strcasecmp($this->tag, 'style') === 0) {
          return '';
      }

      $plain = '';

      // $nodes is null once clear() has run on this node, so guard the loop.
      if ($this->nodes === null) {
          return $plain;
      }

      foreach ($this->nodes as $child) {
          // Start paragraph after a blank line
          if ($child->tag === 'p') {
              $plain = trim($plain) . "\n\n";
          }

          $plain .= $this->convert_text($child->text());

          // Separate consecutive spans so their text does not run together.
          if ($child->tag === 'span') {
              $plain .= $this->dom->default_span_text;
          }
      }

      return $plain;
  }
  /**
   * Return innertext() with CDATA wrappers removed.
   *
   * The opening marker '<![CDATA[' is removed case-insensitively, the
   * closing marker ']]>' case-sensitively.
   *
   * @return string
   */
  function xmltext()
  {
      $without_open = str_ireplace('<![CDATA[', '', $this->innertext());

      return str_replace(']]>', '', $without_open);
  }
  /**
   * Rebuild the opening tag of this node from its attributes and the
   * recorded whitespace and quoting information.
   *
   * Nodes that carry stored text (text, comment, unknown) return that text
   * instead. Attributes whose value is null or false count as removed and
   * are skipped; a value of true renders the bare attribute name.
   *
   * @return string
   */
  function makeup()
  {
      // text, comment, unknown
      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      $html = '<' . $this->tag;
      $position = 0;

      foreach ($this->attr as $name => $value) {
          // Index into the per-attribute info; advances for skipped
          // attributes too.
          $i = $position++;

          // skip removed attribute
          if ($value === null || $value === false) {
              continue;
          }

          $html .= $this->_[HDOM_INFO_SPACE][$i][0];

          // no value attr: nowrap, checked selected...
          if ($value === true) {
              $html .= $name;
              continue;
          }

          switch ($this->_[HDOM_INFO_QUOTE][$i]) {
              case HDOM_QUOTE_DOUBLE:
                  $quote = '"';
                  break;
              case HDOM_QUOTE_SINGLE:
                  $quote = '\'';
                  break;
              default:
                  $quote = '';
          }

          $html .= $name
              . $this->_[HDOM_INFO_SPACE][$i][1]
              . '='
              . $this->_[HDOM_INFO_SPACE][$i][2]
              . $quote
              . $value
              . $quote;
      }

      return $this->dom->restore_noise($html) . $this->_[HDOM_INFO_ENDSPACE] . '>';
  }
  /**
   * Find descendants matching a CSS-like selector string.
   *
   * The parsed selector is a list of alternatives, each a chain of levels.
   * Every chain is evaluated level by level with seek(), starting from this
   * node; the results of all chains are merged and ordered by their index
   * in the document's node list.
   *
   * @param string   $selector  Selector string understood by parse_selector().
   * @param int|null $idx       null returns all matches; a non-negative value
   *                            returns that match; a negative value counts
   *                            from the end.
   * @param bool     $lowercase Forwarded to seek().
   *
   * @return array|object|null Array of nodes when $idx is null, otherwise
   *                           the selected node or null.
   */
  function find($selector, $idx = null, $lowercase = false)
  {
      $selectors = $this->parse_selector($selector);
      $total = count($selectors);
      if ($total === 0) { return array(); }

      $found_keys = array();

      for ($s = 0; $s < $total; ++$s) {
          // An empty chain, or a node without a begin index, yields nothing
          // at all (see sourceforge code tracker id 2788009).
          $depth = count($selectors[$s]);
          if ($depth === 0) { return array(); }
          if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

          $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);
          $combinator = ' ';

          // Descendant handling is iterative, not recursive.
          for ($level = 0; $level < $depth; ++$level) {
              $matched = array();
              foreach ($frontier as $key => $unused) {
                  $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                  $start->seek($selectors[$s][$level], $matched, $combinator, $lowercase);
              }
              $frontier = $matched;
              $combinator = $selectors[$s][$level][4]; // combinator for the next level
          }

          // Union keeps keys already collected by earlier alternatives.
          $found_keys += $frontier;
      }

      ksort($found_keys);

      $found = array();
      foreach ($found_keys as $key => $unused) {
          $found[] = $this->dom->nodes[$key];
      }

      if ($idx === null) {
          return $found;
      }

      if ($idx < 0) {
          $idx += count($found);
      }

      return isset($found[$idx]) ? $found[$idx] : null;
  }
  /**
   * Collect the nodes that match one selector level, relative to this node.
   *
   * Candidate nodes are chosen by the combinator that led to this level:
   *  - ' ' all document nodes between this node's begin index and its end
   *        index (or the end index of the nearest ancestor that has one),
   *  - '>' this node's children,
   *  - '+' the child that directly follows this node in its parent,
   *  - '~' the parent's children starting at this node's position.
   * Each candidate is then tested against tag, id, classes and attribute
   * conditions.
   *
   * @param array  $selector   One parsed level:
   *                           array(tag, id, classes, attributes, combinator).
   * @param array  $ret        Result set, passed by reference; matching nodes
   *                           are recorded as node-index => 1.
   * @param string $parent_cmd Combinator that connects the previous level to
   *                           this one.
   * @param bool   $lowercase  Lower-case class names and attribute values
   *                           before comparing.
   *
   * @return void
   */
  protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      // The fifth element (the following combinator) is not needed here.
      list($tag, $id, $class, $attributes) = $selector;
      $nodes = array();

      if ($parent_cmd === ' ') { // Descendant Combinator
          // Find parent closing tag if the current element doesn't have a
          // closing tag (i.e. void element)
          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
          if ($end == 0) {
              $parent = $this->parent;
              // Test for null before looking at the parent, and never
              // dereference a null parent afterwards.
              while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                  $end -= 1;
                  $parent = $parent->parent;
              }
              if ($parent !== null) {
                  $end += $parent->_[HDOM_INFO_END];
              }
          }

          // Get list of target nodes
          $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
          $nodes_count = $end - $nodes_start;
          $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
      } elseif ($parent_cmd === '>') { // Child Combinator
          $nodes = $this->children;
      } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
          // One strict search replaces the former loose in_array() check
          // followed by a strict array_search(): loose comparison of node
          // objects walks their (cyclic) properties and could disagree with
          // the strict lookup.
          $siblings = $this->parent->children;
          $index = array_search($this, $siblings, true);

          if ($index !== false) {
              if ($parent_cmd === '+') { // Next-Sibling Combinator
                  if ($index + 1 < count($siblings)) {
                      $nodes[] = $siblings[$index + 1];
                  }
              } else { // Subsequent Sibling Combinator
                  // NOTE(review): the slice starts at this node itself, so
                  // the node is one of its own candidates - confirm whether
                  // $index + 1 is intended.
                  $nodes = array_slice($siblings, $index);
              }
          }
      }

      // Go through each candidate.
      // Note: If this element is a void tag, any previous void element is
      // skipped.
      foreach ($nodes as $node) {
          // Skip root nodes
          if (!$node->parent) {
              continue;
          }

          // Handle 'text' selector
          if ($tag === 'text' && $node->tag === 'text') {
              $ret[array_search($node, $this->dom->nodes, true)] = 1;
              continue;
          }

          // Skip if node isn't a child node (i.e. text nodes)
          if (!in_array($node, $node->parent->children, true)) {
              continue;
          }

          // Skip if tag doesn't match
          if ($tag !== '' && $tag !== $node->tag && $tag !== '*') {
              continue;
          }

          // Check ID
          if ($id !== '') {
              if (!isset($node->attr['id'])) {
                  continue;
              }
              // Note: Only consider the first ID (as browsers do)
              $node_id = explode(' ', trim($node->attr['id']))[0];
              if ($id !== $node_id) {
                  continue;
              }
          }

          // Check if all class(es) exist
          if ($class !== '' && is_array($class) && !empty($class)) {
              if (!isset($node->attr['class'])) {
                  continue;
              }

              $node_classes = explode(' ', $node->attr['class']);
              if ($lowercase) {
                  $node_classes = array_map('strtolower', $node_classes);
              }

              foreach ($class as $c) {
                  // Strict: class names are strings and must not be matched
                  // through numeric juggling (e.g. "1e1" vs "10").
                  if (!in_array($c, $node_classes, true)) {
                      continue 2; // next candidate node
                  }
              }
          }

          // Check attributes
          if ($attributes !== '' && is_array($attributes) && !empty($attributes)) {
              foreach ($attributes as $a) {
                  list(
                      $att_name,
                      $att_expr,
                      $att_val,
                      $att_inv,
                      $att_case_sensitivity
                  ) = $a;

                  // Handle indexing attributes (i.e. "[2]")
                  /**
                   * Note: This is not supported by the CSS Standard but adds
                   * the ability to select items compatible to XPath (i.e.
                   * the 3rd element within its parent).
                   *
                   * Note: This doesn't conflict with the CSS Standard which
                   * doesn't work on numeric attributes anyway.
                   */
                  if (is_numeric($att_name)
                      && $att_expr === ''
                      && $att_val === '') {
                      // Position of the node among same-tag siblings (1-based)
                      $count = 0;
                      foreach ($node->parent->children as $c) {
                          if ($c->tag === $node->tag) { ++$count; }
                          if ($c === $node) { break; }
                      }

                      // If this is the correct node, continue with next
                      // attribute
                      if ($count === (int)$att_name) { continue; }
                  }

                  // Check attribute availability
                  if ($att_inv) { // Attribute should NOT be set
                      if (isset($node->attr[$att_name])) {
                          continue 2;
                      }
                  } else { // Attribute should be set
                      // todo: "plaintext" is not a valid CSS selector!
                      if ($att_name !== 'plaintext'
                          && !isset($node->attr[$att_name])) {
                          continue 2;
                      }
                  }

                  // Continue with next attribute if expression isn't defined
                  if ($att_expr === '') { continue; }

                  // A "plaintext" condition is tested against the node's
                  // plain text instead of an attribute value.
                  // todo "plaintext" is not a valid CSS selector!
                  if ($att_name === 'plaintext') {
                      $nodeKeyValue = $node->text();
                  } else {
                      $nodeKeyValue = $node->attr[$att_name];
                  }

                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2,
                          'testing node: '
                          . $node->tag
                          . ' for attribute: '
                          . $att_name
                          . $att_expr
                          . $att_val
                          . ' where nodes value is: '
                          . $nodeKeyValue
                      );
                  }

                  // If lowercase is set, do a case insensitive test of
                  // the value of the selector.
                  if ($lowercase) {
                      $check = $this->match(
                          $att_expr,
                          strtolower($att_val),
                          strtolower($nodeKeyValue),
                          $att_case_sensitivity
                      );
                  } else {
                      $check = $this->match(
                          $att_expr,
                          $att_val,
                          $nodeKeyValue,
                          $att_case_sensitivity
                      );
                  }

                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2,
                          'after match: '
                          . ($check ? 'true' : 'false')
                      );
                  }

                  if (!$check) {
                      continue 2;
                  }
              }
          }

          // Every condition passed: record the node by its begin index.
          $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
      }

      // It's passed by reference so this is actually what this function returns.
      if (is_object($debug_object)) {
          $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
      }
  }
  /**
   * Test an attribute value against a selector value with the given
   * attribute operator.
   *
   * @param string $exp              Operator: '=', '!=', '^=', '$=', '*=',
   *                                 '|=' or '~='. Anything else yields false.
   * @param string $pattern          Value written in the selector.
   * @param string $value            Value found on the node.
   * @param string $case_sensitivity 'i' lower-cases both sides before the
   *                                 comparison; other values leave them as is.
   *
   * @return bool True when $value satisfies the condition.
   */
  protected function match($exp, $pattern, $value, $case_sensitivity)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      if ($case_sensitivity === 'i') {
          $pattern = strtolower($pattern);
          $value = strtolower($value);
      }

      switch ($exp) {
          case '=':
              return ($value === $pattern);
          case '!=':
              return ($value !== $pattern);
          case '^=':
              return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
          case '$=':
              return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
          case '*=':
              return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
          case '|=':
              /**
               * [att|=val]
               *
               * Represents an element with the att attribute, its value
               * either being exactly "val" or beginning with "val"
               * immediately followed by "-" (U+002D).
               *
               * A plain prefix test is not enough: "english" must not
               * match [lang|=en].
               */
              return $value === $pattern
                  || strpos($value, $pattern . '-') === 0;
          case '~=':
              /**
               * [att~=val]
               *
               * Represents an element with the att attribute whose value is a
               * whitespace-separated list of words, one of which is exactly
               * "val". If "val" contains whitespace, it will never represent
               * anything (since the words are separated by spaces). Also if
               * "val" is the empty string, it will never represent anything.
               */
              if ($pattern === '') {
                  return false;
              }
              // Split on any run of whitespace, not only on a single space.
              $words = preg_split('/\s+/', trim($value), -1, PREG_SPLIT_NO_EMPTY);
              return in_array($pattern, $words, true);
      }

      return false;
  }
  /**
   * Break a CSS selector string into its parsed parts.
   *
   * The selector pattern (adapted from mootools, https://mootools.net/)
   * yields these capture groups per simple selector:
   *   [1] tag name     - word characters, colons, asterisks and hyphens
   *   [2] id           - the name following "#"
   *   [3] class names  - dot-separated list following the first "."
   *   [4] attributes   - one or more "[...]" blocks; the name may carry an
   *                      uncaptured leading "@" and a "!" negation prefix,
   *                      followed by an optional operator/value pair and an
   *                      optional i/s case flag
   *   [5] separator    - any run of "/", ",", " ", ">", "+", "~"
   *
   * Attribute names may contain a colon (e.g. attr:ibute); such attributes
   * have to be read through getAttribute() because $node->x:y is not valid
   * PHP syntax.
   *
   * @param string $selector_string Raw selector, possibly a comma-separated list.
   * @return array One entry per selector in the list. Each entry is a list of
   *               array(tag, id, classes, attributes, separator) where classes
   *               is '' or an array of names and attributes is '' or a list of
   *               array(name, expression, value, inverted, case-sensitivity).
   */
  protected function parse_selector($selector_string)
  {
      global $debug_object;
      if (is_object($debug_object)) {
          $debug_object->debug_log_entry(1);
      }

      // phpcs:ignore Generic.Files.LineLength
      $selector_regex = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

      // Based on group [4] above, but capturing name, expression, value and
      // case flag of every single "[...]" block.
      $attribute_regex = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is";

      preg_match_all(
          $selector_regex,
          trim($selector_string) . ' ', // Trailing ' ' acts as a pseudo separator
          $matches,
          PREG_SET_ORDER
      );

      if (is_object($debug_object)) {
          $debug_object->debug_log(2, 'Matches Array: ', $matches);
      }

      $selector_list = array();
      $chain = array();

      foreach ($matches as $part) {
          $part[0] = trim($part[0]);

          // Nothing to do for empty matches and bare slashes
          if ($part[0] === '' || $part[0] === '/' || $part[0] === '//') {
              continue;
          }

          if ($this->dom->lowercase) {
              $part[1] = strtolower($part[1]);
          }

          if ($part[3] !== '') {
              $part[3] = explode('.', $part[3]);
          }

          if ($part[4] !== '') {
              preg_match_all(
                  $attribute_regex,
                  trim($part[4]),
                  $attribute_matches,
                  PREG_SET_ORDER
              );

              $parsed_attributes = array();
              foreach ($attribute_matches as $attribute) {
                  if (trim($attribute[0]) === '') {
                      continue;
                  }

                  // A leading "!" negates the attribute test
                  $attribute_name = $attribute[1];
                  $negated = (isset($attribute_name[0]) && $attribute_name[0] === '!');
                  if ($negated) {
                      $attribute_name = substr($attribute_name, 1);
                  }

                  $parsed_attributes[] = array(
                      $attribute_name,
                      isset($attribute[2]) ? $attribute[2] : '',
                      isset($attribute[3]) ? $attribute[3] : '',
                      $negated,
                      isset($attribute[4]) ? strtolower($attribute[4]) : '',
                  );
              }
              $part[4] = $parsed_attributes;
          }

          // Whitespace-only separators collapse to the descendant
          // combinator ' '; everything else is trimmed.
          $separator = trim($part[5]);
          if ($separator === '' && $part[5] !== '') {
              $separator = ' ';
          }

          // A comma terminates the current selector of a selector list
          $ends_selector = ($separator === ',');
          $part[5] = $ends_selector ? '' : $separator;

          // Drop the full match before storing the parts
          array_shift($part);
          $chain[] = $part;

          if ($ends_selector) {
              $selector_list[] = $chain;
              $chain = array();
          }
      }

      if (count($chain) > 0) {
          $selector_list[] = $chain;
      }

      return $selector_list;
  }
  /**
   * Magic property reader.
   *
   * An attribute that is set takes precedence and is returned through
   * convert_text(). Otherwise the virtual properties outertext, innertext,
   * plaintext and xmltext are served by the matching method. For any other
   * name the result is a boolean telling whether the attribute key exists.
   *
   * @param string $name Attribute or virtual property name.
   * @return mixed
   */
  public function __get($name)
  {
      if (isset($this->attr[$name])) {
          $raw_value = $this->attr[$name];
          return $this->convert_text($raw_value);
      }

      switch ($name) {
          case 'outertext':
              return $this->outertext();
          case 'innertext':
              return $this->innertext();
          case 'plaintext':
              return $this->text();
          case 'xmltext':
              return $this->xmltext();
      }

      return array_key_exists($name, $this->attr);
  }
  /**
   * Magic property writer.
   *
   * "outertext" and "innertext" are stored in the node info array; every
   * other name is stored as an attribute. When the attribute is not set yet,
   * a spacing entry and a double-quote marker are appended for it first.
   *
   * @param string $name  Attribute or virtual property name.
   * @param mixed  $value Value to store.
   * @return mixed The stored value for outertext/innertext, otherwise nothing.
   */
  public function __set($name, $value)
  {
      global $debug_object;
      if (is_object($debug_object)) {
          $debug_object->debug_log_entry(1);
      }

      switch ($name) {
          case 'outertext':
              return $this->_[HDOM_INFO_OUTER] = $value;
          case 'innertext':
              // Nodes that already carry a text slot keep using it
              $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
              return $this->_[$slot] = $value;
      }

      $is_new_attribute = !isset($this->attr[$name]);
      if ($is_new_attribute) {
          $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
          $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
      }

      $this->attr[$name] = $value;
  }
  /**
   * Magic isset() handler.
   *
   * outertext, innertext and plaintext always count as set. For attributes
   * the mere existence of the key is enough, so attributes without a value
   * (nowrap, checked, selected, ...) are reported as set too.
   *
   * @param string $name Attribute or virtual property name.
   * @return bool
   */
  public function __isset($name)
  {
      switch ($name) {
          case 'outertext':
          case 'innertext':
          case 'plaintext':
              return true;
      }

      return array_key_exists($name, $this->attr);
  }
  /**
   * Magic unset() handler: drops the attribute when it is currently set
   * (attributes whose value is null are left untouched).
   *
   * @param string $name Attribute name.
   * @return void
   */
  public function __unset($name)
  {
      if (!isset($this->attr[$name])) {
          return;
      }

      unset($this->attr[$name]);
  }
  /**
   * Convert text from the document charset to the target charset.
   *
   * Conversion only happens when both charsets are known and differ. When
   * the target is UTF-8 and the text already is valid UTF-8 (i.e. the
   * reported source charset was wrong) the text is left alone. If iconv()
   * fails, the unconverted text is returned instead of false so callers
   * always receive usable text. For a UTF-8 target a leading and a trailing
   * byte order mark are removed.
   *
   * @param string $text Text to convert.
   * @return string Converted text, or the input when no conversion applies.
   */
  function convert_text($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $converted_text = $text;

      $sourceCharset = '';
      $targetCharset = '';

      if ($this->dom) {
          $sourceCharset = strtoupper($this->dom->_charset);
          $targetCharset = strtoupper($this->dom->_target_charset);
      }

      if (is_object($debug_object)) {
          $debug_object->debug_log(3,
              'source charset: '
              . $sourceCharset
              . ' target charaset: '
              . $targetCharset
          );
      }

      if (!empty($sourceCharset)
          && !empty($targetCharset)
          && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
          // The reported encoding may be wrong: text that already is valid
          // UTF-8 must not be converted again.
          $already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
              && $this->is_utf8($text);

          if (!$already_utf8) {
              $iconv_result = iconv($sourceCharset, $targetCharset, $text);

              // iconv() returns false on failure; keep the original text in
              // that case rather than handing false to the caller.
              if ($iconv_result !== false) {
                  $converted_text = $iconv_result;
              }
          }
      }

      // Strip a byte order mark from either end of UTF-8 output.
      if ($targetCharset === 'UTF-8') {
          if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
              $converted_text = substr($converted_text, 3);
          }

          if (substr($converted_text, -3) === "\xef\xbb\xbf") {
              $converted_text = substr($converted_text, 0, -3);
          }
      }

      return $converted_text;
  }
  /**
   * Check whether a byte string is structurally valid UTF-8.
   *
   * Every lead byte must be followed by the number of continuation bytes
   * (0x80-0xBF) it announces. Bytes 0x80-0xBF are continuation bytes and are
   * therefore rejected in lead position; this includes 0x80 itself. The
   * historical 5- and 6-byte lead bytes (0xF8-0xFD) are still accepted, as
   * before.
   *
   * @param string $str Bytes to inspect.
   * @return bool True when all sequences are well formed (also for '').
   */
  static function is_utf8($str)
  {
      $len = strlen($str);

      for ($i = 0; $i < $len; $i++) {
          $c = ord($str[$i]);

          // Plain ASCII needs no further checks
          if ($c < 128) {
              continue;
          }

          // Determine the total sequence length from the lead byte
          if ($c >= 254) { return false; }
          elseif ($c >= 252) { $bits = 6; }
          elseif ($c >= 248) { $bits = 5; }
          elseif ($c >= 240) { $bits = 4; }
          elseif ($c >= 224) { $bits = 3; }
          elseif ($c >= 192) { $bits = 2; }
          else { return false; } // 0x80-0xBF: continuation byte without lead

          // Sequence would run past the end of the string
          if (($i + $bits) > $len) { return false; }

          while ($bits > 1) {
              $i++;
              $b = ord($str[$i]);
              if ($b < 128 || $b > 191) { return false; }
              $bits--;
          }
      }

      return true;
  }
  /**
   * Determine the display size of an image element.
   *
   * The width/height attributes of the tag win. Where one is missing, the
   * inline style attribute is consulted and a value is accepted only when it
   * is an integer pixel length (e.g. "120px"). Dimensions that cannot be
   * determined are reported as -1.
   *
   * Not implemented: sizes coming from class/id rules, from ancestor rules
   * or from external style sheets.
   *
   * @return array|false array('height' => ..., 'width' => ...) or false when
   *                     this node is not an img element.
   */
  function get_display_size()
  {
      if ($this->tag !== 'img') {
          return false;
      }

      $width = -1;
      $height = -1;

      // Explicit attributes on the tag itself
      if (isset($this->attr['width'])) {
          $width = $this->attr['width'];
      }

      if (isset($this->attr['height'])) {
          $height = $this->attr['height'];
      }

      // Inline style declarations
      if (isset($this->attr['style'])) {
          // Thanks to user gnarf from stackoverflow for this regular expression.
          $attributes = array();

          preg_match_all(
              '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
              $this->attr['style'],
              $matches,
              PREG_SET_ORDER
          );

          foreach ($matches as $match) {
              // The value group also swallows whitespace in front of the
              // semicolon, so trim it before inspecting the unit.
              $attributes[$match[1]] = trim($match[2]);
          }

          // Width from the style, only if the tag had no width attribute
          if (isset($attributes['width']) && $width == -1) {
              if (strtolower(substr($attributes['width'], -2)) === 'px') {
                  $proposed_width = trim(substr($attributes['width'], 0, -2));
                  // Compare against false so that "0" is accepted as well
                  if (filter_var($proposed_width, FILTER_VALIDATE_INT) !== false) {
                      $width = $proposed_width;
                  }
              }
          }

          // Height from the style, only if the tag had no height attribute
          if (isset($attributes['height']) && $height == -1) {
              if (strtolower(substr($attributes['height'], -2)) === 'px') {
                  $proposed_height = trim(substr($attributes['height'], 0, -2));
                  if (filter_var($proposed_height, FILTER_VALIDATE_INT) !== false) {
                      $height = $proposed_height;
                  }
              }
          }
      }

      return array(
          'height' => $height,
          'width' => $width
      );
  }
  /**
   * Render this node as markup and optionally write it to a file.
   *
   * @param string $filepath Target file; '' skips writing.
   * @return string The outer text of this node.
   */
  public function save($filepath = '')
  {
      $markup = $this->outertext();

      if ($filepath === '') {
          return $markup;
      }

      file_put_contents($filepath, $markup, LOCK_EX);

      return $markup;
  }
  /**
   * Add one or more class names to the class attribute.
   *
   * Names already present are skipped. Empty names, as produced by repeated
   * spaces in the input string, are ignored.
   *
   * @param string|array $class Space-separated names or a list of names.
   * @return void
   */
  function addClass($class)
  {
      // Required for the diagnostics below; without it $debug_object would
      // be an undefined local variable.
      global $debug_object;

      if (is_string($class)) {
          $class = explode(' ', $class);
      }

      if (!is_array($class)) {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
          }
          return;
      }

      foreach ($class as $c) {
          if ($c === '') {
              continue;
          }

          if (!isset($this->class)) {
              $this->class = $c;
          } elseif (!$this->hasClass($c)) {
              $this->class .= ' ' . $c;
          }
      }
  }
  /**
   * Tell whether the class attribute contains the given class name.
   *
   * @param string $class Single class name to look for.
   * @return bool False when the name is absent, no class attribute exists
   *              or $class is not a string.
   */
  function hasClass($class)
  {
      // Required for the diagnostics below; without it $debug_object would
      // be an undefined local variable.
      global $debug_object;

      if (is_string($class)) {
          if (isset($this->class)) {
              return in_array($class, explode(' ', $this->class), true);
          }
      } else {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
          }
      }

      return false;
  }
  /**
   * Remove class names from the class attribute.
   *
   * Without an argument the whole class attribute is removed. The attribute
   * is also removed when no class name is left afterwards.
   *
   * @param string|array|null $class Space-separated names, a list of names,
   *                                 or null for all classes.
   * @return void
   */
  public function removeClass($class = null)
  {
      if (!isset($this->class)) {
          return;
      }

      if (is_null($class)) {
          $this->removeAttribute('class');
          return;
      }

      if (is_string($class)) {
          $class = explode(' ', $class);
      }

      if (!is_array($class)) {
          return;
      }

      $remaining = array_diff(explode(' ', $this->class), $class);

      if (empty($remaining)) {
          $this->removeAttribute('class');
          return;
      }

      $this->class = implode(' ', $remaining);
  }
  /**
   * Return the raw attribute array of this node.
   *
   * @return array
   */
  public function getAllAttributes()
  {
      $attributes = $this->attr;

      return $attributes;
  }
  /**
   * Read an attribute by name; delegates to the magic getter.
   *
   * @param string $name Attribute name.
   * @return mixed
   */
  public function getAttribute($name)
  {
      $value = $this->__get($name);

      return $value;
  }
  /**
   * Write an attribute by name; delegates to the magic setter.
   *
   * @param string $name  Attribute name.
   * @param mixed  $value New value.
   * @return void
   */
  public function setAttribute($name, $value)
  {
      $this->__set($name, $value);
  }
  /**
   * Tell whether an attribute exists; delegates to the magic isset handler.
   *
   * @param string $name Attribute name.
   * @return bool
   */
  public function hasAttribute($name)
  {
      $exists = $this->__isset($name);

      return $exists;
  }
  /**
   * Remove an attribute by assigning null to it through the magic setter.
   *
   * @param string $name Attribute name.
   * @return void
   */
  public function removeAttribute($name)
  {
      $this->__set($name, null);
  }
  /**
   * Detach this node from its parent; a node without parent is left as is.
   *
   * @return void
   */
  public function remove()
  {
      if (!$this->parent) {
          return;
      }

      $this->parent->removeChild($this);
  }
  /**
   * Remove a child node, including everything below it, from the tree.
   *
   * Nothing happens unless the node is registered in this node's node list,
   * in its children list and in the document's node list.
   *
   * @param object $node Child node to remove.
   * @return void
   */
  public function removeChild($node)
  {
      $nidx = array_search($node, $this->nodes, true);
      $cidx = array_search($node, $this->children, true);
      $didx = array_search($node, $this->dom->nodes, true);

      if ($nidx === false || $cidx === false || $didx === false) {
          return;
      }

      // Remove element children recursively first
      foreach ($node->children as $child) {
          $node->removeChild($child);
      }

      // Then drop whatever is left in the node list of the removed node
      foreach ($node->nodes as $entity) {
          $enidx = array_search($entity, $node->nodes, true);
          $edidx = array_search($entity, $node->dom->nodes, true);

          if ($enidx === false || $edidx === false) {
              continue;
          }

          unset($node->nodes[$enidx]);
          unset($node->dom->nodes[$edidx]);
      }

      // Finally unregister the node itself and release it
      unset($this->nodes[$nidx]);
      unset($this->children[$cidx]);
      unset($this->dom->nodes[$didx]);

      $node->clear();
  }
  /**
   * Find the first element carrying the given id.
   *
   * @param string $id Element id (without "#").
   * @return mixed Result of find() for index 0.
   */
  public function getElementById($id)
  {
      $selector = '#' . $id;

      return $this->find($selector, 0);
  }
  /**
   * Find elements carrying the given id.
   *
   * @param string   $id  Element id (without "#").
   * @param int|null $idx Index passed through to find().
   * @return mixed Result of find().
   */
  public function getElementsById($id, $idx = null)
  {
      $selector = '#' . $id;

      return $this->find($selector, $idx);
  }
  /**
   * Find the first element matching the given tag name (or selector).
   *
   * @param string $name Tag name handed to find() as selector.
   * @return mixed Result of find() for index 0.
   */
  public function getElementByTagName($name)
  {
      $first_match = $this->find($name, 0);

      return $first_match;
  }
  /**
   * Find elements matching the given tag name (or selector).
   *
   * @param string   $name Tag name handed to find() as selector.
   * @param int|null $idx  Index passed through to find().
   * @return mixed Result of find().
   */
  public function getElementsByTagName($name, $idx = null)
  {
      $found = $this->find($name, $idx);

      return $found;
  }
  /**
   * DOM-style alias of parent().
   *
   * @return mixed Result of parent().
   */
  public function parentNode()
  {
      $parent_node = $this->parent();

      return $parent_node;
  }
  /**
   * DOM-style alias of children().
   *
   * @param int $idx Index passed through to children(); defaults to -1.
   * @return mixed Result of children().
   */
  public function childNodes($idx = -1)
  {
      $child_nodes = $this->children($idx);

      return $child_nodes;
  }
  /**
   * DOM-style alias of first_child().
   *
   * @return mixed Result of first_child().
   */
  public function firstChild()
  {
      $first = $this->first_child();

      return $first;
  }
  /**
   * DOM-style alias of last_child().
   *
   * @return mixed Result of last_child().
   */
  public function lastChild()
  {
      $last = $this->last_child();

      return $last;
  }
  /**
   * DOM-style alias of next_sibling().
   *
   * @return mixed Result of next_sibling().
   */
  public function nextSibling()
  {
      $next = $this->next_sibling();

      return $next;
  }
  /**
   * DOM-style alias of prev_sibling().
   *
   * @return mixed Result of prev_sibling().
   */
  public function previousSibling()
  {
      $previous = $this->prev_sibling();

      return $previous;
  }
  /**
   * DOM-style alias of has_child().
   *
   * @return mixed Result of has_child().
   */
  public function hasChildNodes()
  {
      $has_children = $this->has_child();

      return $has_children;
  }
  /**
   * Return the tag name of this node.
   *
   * @return string
   */
  public function nodeName()
  {
      $tag_name = $this->tag;

      return $tag_name;
  }
  /**
   * Attach a node below this node by handing this node to its parent()
   * method.
   *
   * @param object $node Node to attach.
   * @return object The node that was passed in.
   */
  public function appendChild($node)
  {
      $new_parent = $this;
      $node->parent($new_parent);

      return $node;
  }
  1170. }
  1171. class simple_html_dom
  1172. {
  /** Root node of the parsed document; created by prepare(). */
  public $root = null;

  /** All nodes of the document; reset by prepare(). */
  public $nodes = array();

  /** Callback set through set_callback() and cleared by remove_callback(). */
  public $callback = null;

  /** Whether tag names are converted to lowercase. */
  public $lowercase = false;

  /** Size of the document as handed to prepare(). */
  public $original_size;

  /** Current size of the document text (updated when load() alters it). */
  public $size;

  /** Current read offset into the document text. */
  protected $pos;

  /** Document text being parsed. */
  protected $doc;

  /** Character at the current read offset. */
  protected $char;

  /** Running node counter used for begin/end bookkeeping. */
  protected $cursor;

  /** Node that currently receives newly parsed children. */
  protected $parent;

  /** Noise storage; reset by prepare(). NOTE(review): filled by remove_noise(), which is not part of this excerpt. */
  protected $noise = array();

  /** Characters skipped as blanks. */
  protected $token_blank = " \t\r\n";

  /** Characters that terminate an attribute name. */
  protected $token_equal = ' =/>';

  /** Characters that terminate a tag name. */
  protected $token_slash = " />\r\n\t";

  /** NOTE(review): presumably the characters terminating an attribute value; its use is outside this excerpt. */
  protected $token_attr = ' >';

  /** Charset detected for the document, see parse_charset(). */
  public $_charset = '';

  /** Charset that text is converted to on output. */
  public $_target_charset = '';

  /** Default text for br elements, as passed to prepare(). */
  protected $default_br_text = '';

  /** Default text for span elements, as passed to prepare(). */
  public $default_span_text = '';

  /** Tags listed as self-closing. */
  protected $self_closing_tags = array(
      'area' => 1,
      'base' => 1,
      'br' => 1,
      'col' => 1,
      'embed' => 1,
      'hr' => 1,
      'img' => 1,
      'input' => 1,
      'link' => 1,
      'meta' => 1,
      'param' => 1,
      'source' => 1,
      'track' => 1,
      'wbr' => 1
  );

  /** Block tags; a closing block tag may close unclosed ancestors. */
  protected $block_tags = array(
      'body' => 1,
      'div' => 1,
      'form' => 1,
      'root' => 1,
      'span' => 1,
      'table' => 1
  );

  /**
   * Tags with an optional closing tag: a start tag named by the outer key
   * implicitly closes an open parent named by one of the inner keys.
   */
  protected $optional_closing_tags = array(
      // Not optional, see
      // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
      'b' => array('b' => 1),
      'dd' => array('dd' => 1, 'dt' => 1),
      // Not optional, see
      // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
      'dl' => array('dd' => 1, 'dt' => 1),
      'dt' => array('dd' => 1, 'dt' => 1),
      'li' => array('li' => 1),
      'optgroup' => array('optgroup' => 1, 'option' => 1),
      'option' => array('optgroup' => 1, 'option' => 1),
      'p' => array('p' => 1),
      'rp' => array('rp' => 1, 'rt' => 1),
      'rt' => array('rp' => 1, 'rt' => 1),
      'td' => array('td' => 1, 'th' => 1),
      'th' => array('td' => 1, 'th' => 1),
      'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
  );
  /**
   * Create a document and optionally load content right away.
   *
   * A non-empty $str is treated as a source location when it starts with
   * http:// or https:// or names an existing file; in that case it is
   * loaded through load_file(). Any other non-empty string is parsed as
   * markup through load().
   *
   * @param string|null $str             URL, file path or markup.
   * @param bool        $lowercase       Convert tag names to lowercase.
   * @param bool        $forceTagsClosed When false, the optional closing
   *                                     tag rules are disabled.
   * @param string      $target_charset  Charset for converted output.
   * @param bool        $stripRN         Replace line breaks with spaces.
   * @param string      $defaultBRText   Default text for br elements.
   * @param string      $defaultSpanText Default text for span elements.
   * @param int         $options         Option flags passed to load().
   */
  function __construct(
      $str = null,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT,
      $options = 0)
  {
      // Forcing tags to be closed implies that we don't trust the html, but
      // it can lead to parsing errors if we SHOULD trust the html.
      // This has to happen before anything is parsed and has to target the
      // property the parser actually reads (optional_closing_tags).
      if (!$forceTagsClosed) {
          $this->optional_closing_tags = array();
      }

      $this->_target_charset = $target_charset;

      if ($str) {
          $is_url = (preg_match('/^https?:\/\//i', $str) === 1);

          // Only strings that can be a path are probed on the file system;
          // is_file() raises a warning for over-long arguments.
          $is_path = !$is_url
              && strlen($str) <= PHP_MAXPATHLEN
              && is_file($str);

          if ($is_url || $is_path) {
              $this->load_file($str);
          } else {
              $this->load(
                  $str,
                  $lowercase,
                  $stripRN,
                  $defaultBRText,
                  $defaultSpanText,
                  $options
              );
          }
      }
  }
  /**
   * Release the node tree when the document object is destroyed.
   */
  public function __destruct()
  {
      $this->clear();
  }
  /**
   * Parse markup into this document.
   *
   * Script blocks, CDATA sections, comments, style blocks, code blocks,
   * server side scripts and (optionally) Smarty tags are handed to
   * remove_noise() before parsing. Afterwards the document charset is
   * detected.
   *
   * @param string $str             Markup to parse.
   * @param bool   $lowercase       Convert tag names to lowercase.
   * @param bool   $stripRN         Replace \r and \n with spaces.
   * @param string $defaultBRText   Default text for br elements.
   * @param string $defaultSpanText Default text for span elements.
   * @param int    $options         Option flags (HDOM_SMARTY_AS_TEXT).
   * @return $this For chaining.
   */
  public function load(
      $str,
      $lowercase = true,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT,
      $options = 0)
  {
      global $debug_object;

      $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

      // Script removal precedes style removal, per
      // http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
      $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
      $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

      if ($stripRN) {
          // Line breaks become spaces; the cached size is refreshed because
          // the document text was replaced.
          $this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);
          $this->size = strlen($this->doc);
      }

      // CDATA sections
      $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

      // Comments
      $this->remove_noise("'<!--(.*?)-->'is");

      // Style blocks
      $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
      $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");

      // Preformatted code blocks
      $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");

      // Server side scripts
      $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

      // Smarty scripts, only on request
      if ($options & HDOM_SMARTY_AS_TEXT) {
          $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
      }

      $this->parse();

      // Close the root node at the final cursor position
      $this->root->_[HDOM_INFO_END] = $this->cursor;
      $this->parse_charset();

      return $this;
  }
  /**
   * Read a file or URL and parse its content.
   *
   * All arguments are forwarded unchanged to file_get_contents().
   *
   * @return false|null False when the content could not be read; nothing
   *                    otherwise.
   */
  public function load_file()
  {
      $arguments = func_get_args();
      $content = call_user_func_array('file_get_contents', $arguments);

      if ($content === false) {
          return false;
      }

      $this->load($content, true);
  }
  /**
   * Register a callback on the document.
   *
   * @param mixed $function_name Callback to store.
   * @return void
   */
  public function set_callback($function_name)
  {
      $this->callback = $function_name;
  }
  /**
   * Unregister the callback set through set_callback().
   *
   * @return void
   */
  public function remove_callback()
  {
      $this->callback = null;
  }
  /**
   * Render the document as markup and optionally write it to a file.
   *
   * @param string $filepath Target file; '' skips writing.
   * @return string The inner text of the root node.
   */
  public function save($filepath = '')
  {
      $markup = $this->root->innertext();

      if ($filepath === '') {
          return $markup;
      }

      file_put_contents($filepath, $markup, LOCK_EX);

      return $markup;
  }
  /**
   * Run a selector query against the root node.
   *
   * @param string   $selector  CSS selector.
   * @param int|null $idx       Index passed through to the root's find().
   * @param bool     $lowercase Passed through to the root's find().
   * @return mixed Result of the root node's find().
   */
  public function find($selector, $idx = null, $lowercase = false)
  {
      $found = $this->root->find($selector, $idx, $lowercase);

      return $found;
  }
  /**
   * Release all nodes and the document text held by this object.
   *
   * @return void
   */
  public function clear()
  {
      // Clearing "children" as well is documented in the sourceforge
      // repository (2977248) as a fix for ongoing memory leaks that occur
      // even with the use of clear.
      foreach (array('nodes', 'children') as $collection) {
          if (!isset($this->$collection)) {
              continue;
          }

          foreach ($this->$collection as $member) {
              $member->clear();
          }
      }

      foreach (array('parent', 'root') as $link) {
          if (!isset($this->$link)) {
              continue;
          }

          $this->$link->clear();
          unset($this->$link);
      }

      unset($this->doc);
      unset($this->noise);
  }
  /**
   * Dump the node tree, starting at the root node.
   *
   * @param bool $show_attr Passed through to the root node's dump().
   * @return void
   */
  public function dump($show_attr = true)
  {
      $root_node = $this->root;
      $root_node->dump($show_attr);
  }
  /**
   * Reset the parser state for a new document and create its root node.
   *
   * @param string $str             Markup; surrounding whitespace is trimmed.
   * @param bool   $lowercase       Convert tag names to lowercase.
   * @param string $defaultBRText   Default text for br elements.
   * @param string $defaultSpanText Default text for span elements.
   * @return void
   */
  protected function prepare(
      $str, $lowercase = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      $this->clear();

      // Settings
      $this->lowercase = $lowercase;
      $this->default_br_text = $defaultBRText;
      $this->default_span_text = $defaultSpanText;

      // Document text and sizes
      $this->doc = trim($str);
      $this->size = strlen($this->doc);
      $this->original_size = $this->size; // size before any noise removal

      // Read position and bookkeeping
      $this->pos = 0;
      $this->cursor = 1;
      $this->noise = array();
      $this->nodes = array();

      // Root node, which is also the first parent
      $root_node = new simple_html_dom_node($this);
      $this->root = $root_node;
      $this->root->tag = 'root';
      $this->root->_[HDOM_INFO_BEGIN] = -1;
      $this->root->nodetype = HDOM_TYPE_ROOT;
      $this->parent = $this->root;

      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  /**
   * Main parser loop: alternates between reading tags and creating text
   * nodes for the text found between them.
   *
   * @return bool True once read_tag() reports that no further tag follows.
   */
  protected function parse()
  {
      for (;;) {
          $text = $this->copy_until_char('<');

          // No text in front of the next "<": read the tag itself
          if ($text === '') {
              if (!$this->read_tag()) {
                  return true;
              }

              continue;
          }

          // Text between tags becomes a text node
          $text_node = new simple_html_dom_node($this);
          ++$this->cursor;
          $text_node->_[HDOM_INFO_TEXT] = $text;
          $this->link_nodes($text_node, false);
      }
  }
  /**
   * Detect the charset of the loaded document and store it in $_charset.
   *
   * Sources are tried in this order until one yields a charset:
   *   1. the content type reported by
   *      get_last_retrieve_url_contents_content_type(), if that function
   *      exists
   *   2. <meta http-equiv="Content-Type" content="...">
   *   3. <meta charset="...">
   *   4. mb_detect_encoding(), if available (UTF-8 vs. CP1252/ISO-8859-1)
   *   5. UTF-8 as fallback
   * ISO-8859-1 and its aliases are replaced by the superset CP1252.
   *
   * @return string The detected charset.
   */
  protected function parse_charset()
  {
      global $debug_object;

      // Matches "charset=value" case-insensitively, tolerates whitespace
      // and quotes around the value, and stops at the first whitespace,
      // semicolon or quote so trailing parameters are not captured.
      $charset_regex = '/charset\s*=\s*["\']?([^\s;"\']+)/i';

      $charset = null;

      if (function_exists('get_last_retrieve_url_contents_content_type')) {
          $contentTypeHeader = get_last_retrieve_url_contents_content_type();
          $success = preg_match($charset_regex, $contentTypeHeader, $matches);
          if ($success) {
              $charset = $matches[1];
              if (is_object($debug_object)) {
                  $debug_object->debug_log(2,
                      'header content-type found charset of: '
                      . $charset
                  );
              }
          }
      }

      if (empty($charset)) {
          // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
          $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

          if (!empty($el)) {
              $fullvalue = $el->content;
              if (is_object($debug_object)) {
                  $debug_object->debug_log(2,
                      'meta content-type tag found'
                      . $fullvalue
                  );
              }

              if (!empty($fullvalue)) {
                  $success = preg_match($charset_regex, $fullvalue, $matches);

                  if ($success) {
                      $charset = $matches[1];
                  } else {
                      // If there is a meta tag, and they don't specify the
                      // character set, research says that it's typically
                      // ISO-8859-1
                      if (is_object($debug_object)) {
                          $debug_object->debug_log(2,
                              'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                          );
                      }

                      $charset = 'ISO-8859-1';
                  }
              }
          }
      }

      if (empty($charset)) {
          // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
          if ($meta = $this->root->find('meta[charset]', 0)) {
              $charset = $meta->charset;
              if (is_object($debug_object)) {
                  $debug_object->debug_log(2, 'meta charset: ' . $charset);
              }
          }
      }

      if (empty($charset)) {
          // Try to guess the charset based on the content
          // Requires Multibyte String (mbstring) support (optional)
          if (function_exists('mb_detect_encoding')) {
              /**
               * mb_detect_encoding() is not intended to distinguish between
               * charsets, especially single-byte charsets. Its primary
               * purpose is to detect which multibyte encoding is in use,
               * i.e. UTF-8, UTF-16, shift-JIS, etc.
               *
               * -- https://bugs.php.net/bug.php?id=38138
               *
               * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
               * always result in CP1251/ISO-8859-5 and vice versa.
               *
               * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
               * to stay compatible.
               */
              $encoding = mb_detect_encoding(
                  $this->doc,
                  array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
              );

              if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                  // Due to a limitation of mb_detect_encoding
                  // 'CP1251'/'ISO-8859-5' will be detected as
                  // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                  // which case we can simply assume it is the other charset.
                  if (@iconv('CP1252', 'UTF-8', $this->doc) === false) {
                      $encoding = 'CP1251';
                  }
              }

              if ($encoding !== false) {
                  $charset = $encoding;
                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                  }
              }
          }
      }

      if (empty($charset)) {
          // Assume it's UTF-8 as it is the most likely charset to be used
          $charset = 'UTF-8';
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'No match found, assume ' . $charset);
          }
      }

      // Since CP1252 is a superset, if we get one of it's subsets, we want
      // it instead.
      $charset_lower = strtolower($charset);
      if ($charset_lower == 'iso-8859-1'
          || $charset_lower == 'latin1'
          || $charset_lower == 'latin-1') {
          // Log first so that the message names the charset being replaced
          if (is_object($debug_object)) {
              $debug_object->debug_log(2,
                  'replacing ' . $charset . ' with CP1252 as its a superset'
              );
          }
          $charset = 'CP1252';
      }

      if (is_object($debug_object)) {
          $debug_object->debug_log(1, 'EXIT - ' . $charset);
      }

      return $this->_charset = $charset;
  }
  1539. protected function read_tag()
  1540. {
  1541. // Set end position if no further tags found
  1542. if ($this->char !== '<') {
  1543. $this->root->_[HDOM_INFO_END] = $this->cursor;
  1544. return false;
  1545. }
  1546. $begin_tag_pos = $this->pos;
  1547. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1548. // end tag
  1549. if ($this->char === '/') {
  1550. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1551. // Skip whitespace in end tags (i.e. in "</ html>")
  1552. $this->skip($this->token_blank);
  1553. $tag = $this->copy_until_char('>');
  1554. // Skip attributes in end tags
  1555. if (($pos = strpos($tag, ' ')) !== false) {
  1556. $tag = substr($tag, 0, $pos);
  1557. }
  1558. $parent_lower = strtolower($this->parent->tag);
  1559. $tag_lower = strtolower($tag);
  1560. // The end tag is supposed to close the parent tag. Handle situations
  1561. // when it doesn't
  1562. if ($parent_lower !== $tag_lower) {
  1563. // Parent tag does not have to be closed necessarily (optional closing tag)
  1564. // Current tag is a block tag, so it may close an ancestor
  1565. if (isset($this->optional_closing_tags[$parent_lower])
  1566. && isset($this->block_tags[$tag_lower])) {
  1567. $this->parent->_[HDOM_INFO_END] = 0;
  1568. $org_parent = $this->parent;
  1569. // Traverse ancestors to find a matching opening tag
  1570. // Stop at root node
  1571. while (($this->parent->parent)
  1572. && strtolower($this->parent->tag) !== $tag_lower
  1573. ){
  1574. $this->parent = $this->parent->parent;
  1575. }
  1576. // If we don't have a match add current tag as text node
  1577. if (strtolower($this->parent->tag) !== $tag_lower) {
  1578. $this->parent = $org_parent; // restore origonal parent
  1579. if ($this->parent->parent) {
  1580. $this->parent = $this->parent->parent;
  1581. }
  1582. $this->parent->_[HDOM_INFO_END] = $this->cursor;
  1583. return $this->as_text_node($tag);
  1584. }
  1585. } elseif (($this->parent->parent)
  1586. && isset($this->block_tags[$tag_lower])
  1587. ) {
  1588. // Grandparent exists and current tag is a block tag, so our
  1589. // parent doesn't have an end tag
  1590. $this->parent->_[HDOM_INFO_END] = 0; // No end tag
  1591. $org_parent = $this->parent;
  1592. // Traverse ancestors to find a matching opening tag
  1593. // Stop at root node
  1594. while (($this->parent->parent)
  1595. && strtolower($this->parent->tag) !== $tag_lower
  1596. ) {
  1597. $this->parent = $this->parent->parent;
  1598. }
  1599. // If we don't have a match add current tag as text node
  1600. if (strtolower($this->parent->tag) !== $tag_lower) {
  1601. $this->parent = $org_parent; // restore origonal parent
  1602. $this->parent->_[HDOM_INFO_END] = $this->cursor;
  1603. return $this->as_text_node($tag);
  1604. }
  1605. } elseif (($this->parent->parent)
  1606. && strtolower($this->parent->parent->tag) === $tag_lower
  1607. ) { // Grandparent exists and current tag closes it
  1608. $this->parent->_[HDOM_INFO_END] = 0;
  1609. $this->parent = $this->parent->parent;
  1610. } else { // Random tag, add as text node
  1611. return $this->as_text_node($tag);
  1612. }
  1613. }
  1614. // Set end position of parent tag to current cursor position
  1615. $this->parent->_[HDOM_INFO_END] = $this->cursor;
  1616. if ($this->parent->parent) {
  1617. $this->parent = $this->parent->parent;
  1618. }
  1619. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1620. return true;
  1621. }
  1622. // start tag
  1623. $node = new simple_html_dom_node($this);
  1624. $node->_[HDOM_INFO_BEGIN] = $this->cursor;
  1625. ++$this->cursor;
  1626. $tag = $this->copy_until($this->token_slash); // Get tag name
  1627. $node->tag_start = $begin_tag_pos;
  1628. // doctype, cdata & comments...
  1629. // <!DOCTYPE html>
  1630. // <![CDATA[ ... ]]>
  1631. // <!-- Comment -->
  1632. if (isset($tag[0]) && $tag[0] === '!') {
  1633. $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
  1634. if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
  1635. $node->nodetype = HDOM_TYPE_COMMENT;
  1636. $node->tag = 'comment';
  1637. } else { // Could be doctype or CDATA but we don't care
  1638. $node->nodetype = HDOM_TYPE_UNKNOWN;
  1639. $node->tag = 'unknown';
  1640. }
  1641. if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
  1642. $this->link_nodes($node, true);
  1643. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1644. return true;
  1645. }
  1646. // The start tag cannot contain another start tag, if so add as text
  1647. // i.e. "<<html>"
  1648. if ($pos = strpos($tag, '<') !== false) {
  1649. $tag = '<' . substr($tag, 0, -1);
  1650. $node->_[HDOM_INFO_TEXT] = $tag;
  1651. $this->link_nodes($node, false);
  1652. $this->char = $this->doc[--$this->pos]; // prev
  1653. return true;
  1654. }
  1655. // Handle invalid tag names (i.e. "<html#doc>")
  1656. if (!preg_match('/^\w[\w:-]*$/', $tag)) {
  1657. $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
  1658. // Next char is the beginning of a new tag, don't touch it.
  1659. if ($this->char === '<') {
  1660. $this->link_nodes($node, false);
  1661. return true;
  1662. }
  1663. // Next char closes current tag, add and be done with it.
  1664. if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
  1665. $this->link_nodes($node, false);
  1666. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1667. return true;
  1668. }
  1669. // begin tag, add new node
  1670. $node->nodetype = HDOM_TYPE_ELEMENT;
  1671. $tag_lower = strtolower($tag);
  1672. $node->tag = ($this->lowercase) ? $tag_lower : $tag;
  1673. // handle optional closing tags
  1674. if (isset($this->optional_closing_tags[$tag_lower])) {
  1675. // Traverse ancestors to close all optional closing tags
  1676. while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
  1677. $this->parent->_[HDOM_INFO_END] = 0;
  1678. $this->parent = $this->parent->parent;
  1679. }
  1680. $node->parent = $this->parent;
  1681. }
  1682. $guard = 0; // prevent infinity loop
  1683. // [0] Space between tag and first attribute
  1684. $space = array($this->copy_skip($this->token_blank), '', '');
  1685. // attributes
  1686. do {
  1687. // Everything until the first equal sign should be the attribute name
  1688. $name = $this->copy_until($this->token_equal);
  1689. if ($name === '' && $this->char !== null && $space[0] === '') {
  1690. break;
  1691. }
  1692. if ($guard === $this->pos) { // Escape infinite loop
  1693. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1694. continue;
  1695. }
  1696. $guard = $this->pos;
  1697. // handle endless '<'
  1698. // Out of bounds before the tag ended
  1699. if ($this->pos >= $this->size - 1 && $this->char !== '>') {
  1700. $node->nodetype = HDOM_TYPE_TEXT;
  1701. $node->_[HDOM_INFO_END] = 0;
  1702. $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
  1703. $node->tag = 'text';
  1704. $this->link_nodes($node, false);
  1705. return true;
  1706. }
  1707. // handle mismatch '<'
  1708. // Attributes cannot start after opening tag
  1709. if ($this->doc[$this->pos - 1] == '<') {
  1710. $node->nodetype = HDOM_TYPE_TEXT;
  1711. $node->tag = 'text';
  1712. $node->attr = array();
  1713. $node->_[HDOM_INFO_END] = 0;
  1714. $node->_[HDOM_INFO_TEXT] = substr(
  1715. $this->doc,
  1716. $begin_tag_pos,
  1717. $this->pos - $begin_tag_pos - 1
  1718. );
  1719. $this->pos -= 2;
  1720. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1721. $this->link_nodes($node, false);
  1722. return true;
  1723. }
  1724. if ($name !== '/' && $name !== '') { // this is a attribute name
  1725. // [1] Whitespace after attribute name
  1726. $space[1] = $this->copy_skip($this->token_blank);
  1727. $name = $this->restore_noise($name); // might be a noisy name
  1728. if ($this->lowercase) { $name = strtolower($name); }
  1729. if ($this->char === '=') { // attribute with value
  1730. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1731. $this->parse_attr($node, $name, $space); // get attribute value
  1732. } else {
  1733. //no value attr: nowrap, checked selected...
  1734. $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
  1735. $node->attr[$name] = true;
  1736. if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
  1737. }
  1738. $node->_[HDOM_INFO_SPACE][] = $space;
  1739. // prepare for next attribute
  1740. $space = array(
  1741. $this->copy_skip($this->token_blank),
  1742. '',
  1743. ''
  1744. );
  1745. } else { // no more attributes
  1746. break;
  1747. }
  1748. } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
  1749. $this->link_nodes($node, true);
  1750. $node->_[HDOM_INFO_ENDSPACE] = $space[0];
  1751. // handle empty tags (i.e. "<div/>")
  1752. if ($this->copy_until_char('>') === '/') {
  1753. $node->_[HDOM_INFO_ENDSPACE] .= '/';
  1754. $node->_[HDOM_INFO_END] = 0;
  1755. } else {
  1756. // reset parent
  1757. if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
  1758. $this->parent = $node;
  1759. }
  1760. }
  1761. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1762. // If it's a BR tag, we need to set its text to the default text.
  1763. // This way when we see it in plaintext, we can generate formatting that the user wants.
  1764. // since a br tag never has sub nodes, this works well.
  1765. if ($node->tag === 'br') {
  1766. $node->_[HDOM_INFO_INNER] = $this->default_br_text;
  1767. }
  1768. return true;
  1769. }
  /**
   * Parse the value of attribute $name, starting at the current cursor.
   *
   * The caller has already consumed the "=" sign. The value may be wrapped
   * in double quotes, single quotes, or be unquoted (in which case it runs
   * up to the first character listed in $this->token_attr).
   *
   * If $node already has an attribute called $name, the repeated value is
   * still consumed from the document but is discarded: the first occurrence
   * wins and neither the quote type nor $space[2] is recorded for it.
   *
   * @param object $node  Node under construction; receives attr[$name] and
   *                      an entry in _[HDOM_INFO_QUOTE].
   * @param string $name  Attribute name.
   * @param array  $space Whitespace triple of the attribute (by reference);
   *                      slot [2] receives the blanks between "=" and value.
   * @return void
   */
  protected function parse_attr($node, $name, &$space)
  {
      $is_duplicate = isset($node->attr[$name]);

      // Always consume the blanks between "=" and the value. Skipping them
      // only for the first occurrence left the cursor on the whitespace for
      // a repeated attribute, so its value came out empty and the real value
      // was later misread as a new attribute name.
      $blank = $this->copy_skip($this->token_blank);
      if (!$is_duplicate) {
          $space[2] = $blank;
      }

      $opening = $this->char;
      if ($opening === '"' || $opening === '\'') {
          $quote_type = ($opening === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
          // Step over the opening quote, copy up to the matching quote,
          // then step over the closing quote.
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
          $value = $this->copy_until_char($opening);
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      } else {
          $quote_type = HDOM_QUOTE_NO;
          $value = $this->copy_until($this->token_attr);
      }

      $value = $this->restore_noise($value);

      // PaperG: Attributes should not have \r or \n in them, that counts as
      // html whitespace.
      $value = str_replace(array("\r", "\n"), '', $value);

      // PaperG: If this is a "class" selector, get rid of the preceding and
      // trailing space since some people leave it in the multi class case.
      if ($name === 'class') {
          $value = trim($value);
      }

      if (!$is_duplicate) {
          $node->_[HDOM_INFO_QUOTE][] = $quote_type;
          $node->attr[$name] = $value;
      }
  }
  /**
   * Attach $node to the node currently tracked as $this->parent.
   *
   * The node is always appended to the parent's "nodes" list; it is also
   * appended to the parent's "children" list when $is_child is truthy.
   *
   * @param object $node     Node to attach (passed by reference).
   * @param bool   $is_child Whether the node should also be listed in "children".
   * @return void
   */
  protected function link_nodes(&$node, $is_child)
  {
      $node->parent = $this->parent;
      $this->parent->nodes[] = $node;

      if (!$is_child) {
          return;
      }

      $this->parent->children[] = $node;
  }
  /**
   * Store a closing tag as literal text instead of treating it as markup.
   *
   * Creates a new node whose text is "</$tag>", links it to the current
   * parent as a non-child node and moves the read position one character
   * forward.
   *
   * @param string $tag Tag name to wrap in "</" and ">".
   * @return bool Always true.
   */
  protected function as_text_node($tag)
  {
      $text_node = new simple_html_dom_node($this);
      $this->cursor++;

      $text_node->_[HDOM_INFO_TEXT] = "</{$tag}>";
      $this->link_nodes($text_node, false);

      // Advance to the next character (null once past the end of the document).
      $this->pos++;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return true;
  }
  /**
   * Move the read position past a run of characters contained in $chars.
   *
   * Afterwards $this->char holds the character at the new position, or
   * null when the position has reached the end of the document.
   *
   * @param string $chars Set of characters to skip over.
   * @return void
   */
  protected function skip($chars)
  {
      $run_length = strspn($this->doc, $chars, $this->pos);
      $this->pos += $run_length;

      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }
  }
  /**
   * Skip a run of characters contained in $chars and return what was skipped.
   *
   * Afterwards $this->char holds the character at the new position, or
   * null when the position has reached the end of the document.
   *
   * @param string $chars Set of characters to skip over.
   * @return string The skipped characters; '' when nothing was skipped.
   */
  protected function copy_skip($chars)
  {
      $start = $this->pos;
      $length = strspn($this->doc, $chars, $start);

      $this->pos += $length;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return ($length === 0) ? '' : substr($this->doc, $start, $length);
  }
  /**
   * Copy characters from the read position up to (not including) the first
   * character that is contained in $chars.
   *
   * Afterwards the read position sits on that stop character and
   * $this->char holds it, or null when the end of the document was reached.
   *
   * @param string $chars Set of characters that stop the copy.
   * @return string The copied characters.
   */
  protected function copy_until($chars)
  {
      $start = $this->pos;
      $length = strcspn($this->doc, $chars, $start);

      $this->pos += $length;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return substr($this->doc, $start, $length);
  }
  /**
   * Copy characters from the read position up to (not including) the next
   * occurrence of $char.
   *
   * - When the current character is null, '' is returned and nothing moves.
   * - When $char does not occur again, the rest of the document is returned
   *   and the read position is moved to the end (current char becomes null).
   * - When $char is found at the read position itself, '' is returned and
   *   nothing moves.
   * - Otherwise the read position is moved onto the found occurrence.
   *
   * @param string $char Delimiter to search for.
   * @return string The copied characters.
   */
  protected function copy_until_char($char)
  {
      if ($this->char === null) {
          return '';
      }

      $start = $this->pos;
      $found = strpos($this->doc, $char, $start);

      if ($found === false) {
          // Delimiter never shows up again: hand back everything that is left.
          $remainder = substr($this->doc, $start, $this->size - $start);
          $this->char = null;
          $this->pos = $this->size;
          return $remainder;
      }

      if ($found === $start) {
          return '';
      }

      $this->char = $this->doc[$found];
      $this->pos = $found;

      return substr($this->doc, $start, $found - $start);
  }
  /**
   * Replace every match of $pattern in the document with a placeholder key
   * and remember the replaced text in $this->noise under that key.
   *
   * A key is the marker "___noise___" followed by (number of stored noise
   * entries + 1000) formatted with '% 5d'. After the replacements the cached
   * document size is refreshed and, for a non-empty document, the current
   * character is reset to the first character.
   *
   * @param string $pattern    PCRE pattern locating the noise.
   * @param bool   $remove_tag True: replace the entire match (index 0);
   *                           false: replace only the first sub-match (index 1).
   * @return void
   */
  protected function remove_noise($pattern, $remove_tag = false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $total = preg_match_all(
          $pattern,
          $this->doc,
          $hits,
          PREG_SET_ORDER | PREG_OFFSET_CAPTURE
      );

      // 0 = entire match, 1 = submatch
      $group = ($remove_tag) ? 0 : 1;

      // Work from the last match to the first, so a replacement never shifts
      // the recorded offset of a match that is still to be processed.
      for ($i = $total - 1; $i >= 0; $i--) {
          $noise_key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'key is: ' . $noise_key);
          }

          $hit = $hits[$i][$group]; // [0] = matched text, [1] = byte offset
          $this->noise[$noise_key] = $hit[0];
          $this->doc = substr_replace($this->doc, $noise_key, $hit[1], strlen($hit[0]));
      }

      // reset the length of content
      $this->size = strlen($this->doc);

      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  /**
   * Replace noise placeholder keys in $text with the text they stand for.
   *
   * A key is the 11-character marker "___noise___" followed by 5 more
   * characters (16 characters in total) and is looked up in $this->noise.
   *
   * - Known key: replaced by the stored text. The restored text is scanned
   *   again, so keys nested inside stored noise are restored as well.
   * - Unknown key: replaced by "UNDEFINED NOISE FOR KEY: <key>".
   * - Marker with fewer than 5 characters after it: the marker alone is
   *   replaced by "NO NUMERIC NOISE KEY".
   *
   * The scan resumes after each inserted diagnostic message. The message
   * for an unknown key contains the marker again; restarting from offset 0
   * found it over and over, which looped forever on input such as
   * "___noise___12345" with no matching noise entry.
   *
   * NOTE(review): a stored noise value that contains its own key would
   * still be expanded endlessly; that case is not handled here.
   *
   * @param string $text Text that may contain noise keys.
   * @return string Text with the keys resolved.
   */
  public function restore_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $marker = '___noise___';
      $offset = 0;

      while (($pos = strpos($text, $marker, $offset)) !== false) {
          // Sometimes there is a broken piece of markup, and we don't GET the
          // pos+11 etc... token which indicates a problem outside of us...
          if (strlen($text) > $pos + 15) {
              $key = substr($text, $pos, 16);

              if (is_object($debug_object)) {
                  $debug_object->debug_log(2, 'located key of: ' . $key);
              }

              if (isset($this->noise[$key])) {
                  $text = substr($text, 0, $pos)
                      . $this->noise[$key]
                      . substr($text, $pos + 16);
                  // Rescan from the same spot: the restored text may hold
                  // further keys.
                  $offset = $pos;
              } else {
                  $message = 'UNDEFINED NOISE FOR KEY: ' . $key;
                  $text = substr($text, 0, $pos)
                      . $message
                      . substr($text, $pos + 16);
                  // Continue after the message; it contains the marker and
                  // must not be matched again.
                  $offset = $pos + strlen($message);
              }
          } else {
              // There is no valid key being given back to us... We must get
              // rid of the ___noise___ or we will have a problem.
              $message = 'NO NUMERIC NOISE KEY';
              $text = substr($text, 0, $pos)
                  . $message
                  . substr($text, $pos + 11);
              $offset = $pos + strlen($message);
          }
      }

      return $text;
  }
  /**
   * Find the first stored noise entry that contains $text.
   *
   * @param string $text Substring to look for.
   * @return string|null The first entry of $this->noise containing $text,
   *                     or null when no entry contains it.
   */
  public function search_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      foreach ($this->noise as $fragment) {
          if (strpos($fragment, $text) === false) {
              continue;
          }

          return $fragment;
      }

      return null;
  }
  /**
   * String conversion of the document.
   *
   * @return string The result of innertext() on the root node.
   */
  public function __toString()
  {
      $markup = $this->root->innertext();

      return $markup;
  }
  /**
   * Magic read access to the document's virtual properties.
   *
   * Supported names:
   * - "outertext" and "innertext": innertext() of the root node
   * - "plaintext": text() of the root node
   * - "charset": $this->_charset
   * - "target_charset": $this->_target_charset
   *
   * @param string $name Property name.
   * @return mixed The property value, or null for any other name.
   */
  public function __get($name)
  {
      if ($name === 'outertext' || $name === 'innertext') {
          return $this->root->innertext();
      }

      if ($name === 'plaintext') {
          return $this->root->text();
      }

      if ($name === 'charset') {
          return $this->_charset;
      }

      if ($name === 'target_charset') {
          return $this->_target_charset;
      }

      return null;
  }
  /**
   * CamelCase wrapper that forwards to childNodes() of the root node.
   *
   * @param int $idx Index passed through unchanged (default -1); its meaning
   *                 is defined by the root node's childNodes().
   * @return mixed Whatever the root node's childNodes() returns.
   */
  public function childNodes($idx = -1)
  {
      $result = $this->root->childNodes($idx);

      return $result;
  }
  /**
   * CamelCase wrapper that forwards to first_child() of the root node.
   *
   * @return mixed Whatever the root node's first_child() returns.
   */
  public function firstChild()
  {
      $first = $this->root->first_child();

      return $first;
  }
  /**
   * CamelCase wrapper that forwards to last_child() of the root node.
   *
   * @return mixed Whatever the root node's last_child() returns.
   */
  public function lastChild()
  {
      $last = $this->root->last_child();

      return $last;
  }
  /**
   * Build an element by parsing the markup "<$name>$value</$name>" with
   * str_get_html() and returning firstChild() of the parsed result.
   *
   * Diagnostics raised while building and parsing are silenced with "@".
   *
   * @param string     $name  Tag name of the element.
   * @param mixed|null $value Content placed between the opening and closing tag.
   * @return mixed Whatever firstChild() of the parsed markup returns.
   */
  public function createElement($name, $value = null)
  {
      // %1$s = tag name (used twice), %2$s = content.
      return @str_get_html(sprintf('<%1$s>%2$s</%1$s>', $name, $value))->firstChild();
  }
  /**
   * Build a node from $value by parsing it with str_get_html() and returning
   * the last entry of the parsed result's "nodes" list.
   *
   * Diagnostics raised by the expression are silenced with "@".
   *
   * @param string $value Text to parse.
   * @return mixed The last element of the "nodes" list, as returned by end().
   */
  function createTextNode($value)
  {
  // end() takes its argument by reference; the list is read straight off the
  // freshly parsed result, so the statement is kept exactly as written.
  return @end(str_get_html($value)->nodes);
  }
  /**
   * Look up an element by id.
   *
   * Runs find() with the selector "#<id>" and index 0.
   *
   * @param string $id Element id (without the leading "#").
   * @return mixed Whatever find() returns for that selector and index 0.
   */
  public function getElementById($id)
  {
      $selector = '#' . $id;

      return $this->find($selector, 0);
  }
  /**
   * Look up elements by id.
   *
   * Runs find() with the selector "#<id>" and the given index.
   *
   * @param string   $id  Element id (without the leading "#").
   * @param int|null $idx Index passed through to find() unchanged (default null).
   * @return mixed Whatever find() returns for that selector and index.
   */
  public function getElementsById($id, $idx = null)
  {
      $selector = '#' . $id;

      return $this->find($selector, $idx);
  }
  /**
   * Look up an element by tag name.
   *
   * Runs find() with $name as the selector and index 0.
   *
   * @param string $name Tag name, used as the selector as-is.
   * @return mixed Whatever find() returns for that selector and index 0.
   */
  public function getElementByTagName($name)
  {
      $match = $this->find($name, 0);

      return $match;
  }
  /**
   * Look up elements by tag name.
   *
   * Runs find() with $name as the selector and the given index.
   *
   * NOTE(review): the default index is -1 here while getElementsById()
   * defaults to null; how find() treats -1 is not visible in this part of
   * the file - confirm the difference is intended.
   *
   * @param string $name Tag name, used as the selector as-is.
   * @param int    $idx  Index passed through to find() unchanged (default -1).
   * @return mixed Whatever find() returns for that selector and index.
   */
  public function getElementsByTagName($name, $idx = -1)
  {
      $matches = $this->find($name, $idx);

      return $matches;
  }
  /**
   * CamelCase alias of load_file().
   *
   * Every argument given to this method is forwarded to load_file() in the
   * same order. The arguments used to be handed over as one single array
   * argument, so load_file() never received the values the caller supplied.
   *
   * NOTE(review): load_file() is defined outside this part of the file;
   * this assumes it expects the same positional arguments as this alias
   * rather than one array - confirm.
   *
   * @return mixed Whatever load_file() returns.
   */
  public function loadFile()
  {
      $args = func_get_args();

      // Spread the collected arguments back into individual parameters.
      return call_user_func_array(array($this, 'load_file'), $args);
  }
  1997. }