<?php
// fuckhtml.php
  /**
   * Small regex-driven HTML / JavaScript scraping helper.
   *
   * Elements are plain associative arrays with these keys:
   *   - "tagName"    lower-cased tag name
   *   - "startPos"   byte offset of the opening tag in the loaded markup
   *   - "endPos"     byte offset just past the closing tag (0 when no closing tag was paired)
   *   - "attributes" map of lower-cased attribute name => value ("true" for valueless attributes)
   *   - "innerHTML"  markup between the tags (null when no closing tag was paired)
   *   - "level"      nesting counter of this tag name at the opening tag
   *   - "outerHTML"  opening tag through closing tag (only present when a closing tag was paired)
   *
   * This is pattern matching, not a real HTML parser: comments, scripts and
   * malformed markup are not treated specially.
   */
  class fuckhtml
  {
      /** @var string|null Markup most recently passed to load(). */
      public $html;

      /** @var int|null Byte length of $html, refreshed by load(). */
      public $strlen;

      /**
       * @param string|array|null $html   Markup, file path, or element array; null loads nothing.
       * @param bool              $isfile Treat $html as a path to read.
       */
      public function __construct($html = null, $isfile = false)
      {
          if ($html !== null) {
              $this->load($html, $isfile);
          }
      }

      /**
       * Replace the loaded markup.
       *
       * @param string|array $html   Markup, a file path (when $isfile is true), or an
       *                             element array whose "innerHTML" is used.
       * @param bool         $isfile Read the markup from the file at $html.
       *
       * @throws Exception When an array without "innerHTML" is supplied or the file cannot be read.
       */
      public function load($html, $isfile = false)
      {
          if (is_array($html)) {
              if (!isset($html["innerHTML"])) {
                  throw new Exception("(load) Supplied array doesn't contain an innerHTML index");
              }

              $html = $html["innerHTML"];
          }

          if ($isfile) {
              // file_get_contents copes with empty files, which fread(…, 0) does not.
              $contents = is_readable($html) ? file_get_contents($html) : false;

              if ($contents === false) {
                  throw new Exception("(load) Could not read file: " . $html);
              }

              $html = $contents;
          }

          $this->html = $html;
          $this->strlen = strlen($this->html);
      }

      /**
       * @return string|null The markup currently loaded.
       */
      public function getloadedhtml()
      {
          return $this->html;
      }

      /**
       * Find every element with the given tag name in the loaded markup.
       *
       * @param string $tagname Tag name (case-insensitive) or "*" for any tag.
       *
       * @return array List of element arrays in document order (see class description).
       */
      public function getElementsByTagName(string $tagname)
      {
          $html = (string) $this->html;

          if ($tagname === "*") {
              $tagname = '[A-Za-z0-9._-]+';
          } else {
              // Quote for the "/" delimiter used by the patterns below.
              $tagname = preg_quote(strtolower($tagname), "/");
          }

          /*
              Opening tags. Example:
              <div class="mydiv"> ...
          */
          $found = preg_match_all(
              '/<\s*(' . $tagname . ')(\s(?:[^>\'"]*|"[^"]*"|\'[^\']*\')+)?\s*>/i',
              $html,
              $starting_tags,
              PREG_OFFSET_CAPTURE
          );

          $out = [];

          for ($i = 0, $total = (int) $found; $i < $total; $i++) {
              $out[] = [
                  "tagName" => strtolower($starting_tags[1][$i][0]),
                  "startPos" => $starting_tags[0][$i][1],
                  "endPos" => 0,
                  "startTag" => $starting_tags[0][$i][0],
                  "attributes" => $this->parseAttributes($starting_tags[2][$i][0]),
                  "innerHTML" => null
              ];
          }

          // Closing tags, merged into the same list so both kinds can be walked in order.
          $found = preg_match_all(
              '/<\s*\/\s*(' . $tagname . ')\s*>/i',
              $html,
              $regex_closing_tags,
              PREG_OFFSET_CAPTURE
          );

          for ($i = 0, $total = (int) $found; $i < $total; $i++) {
              $out[] = [
                  "tagName" => strtolower($regex_closing_tags[1][$i][0]),
                  "endTag" => $regex_closing_tags[0][$i][0],
                  "startPos" => $regex_closing_tags[0][$i][1]
              ];
          }

          // The comparator must return an int; offsets are unique so the order is total.
          usort(
              $out,
              function ($a, $b) {
                  return $a["startPos"] <=> $b["startPos"];
              }
          );

          // Walk the tags in document order. Per tag name, $level is the running
          // nesting counter and $open is the stack of opening tags still waiting
          // for their closing tag.
          $level = [];
          $open = [];
          $count = count($out);

          for ($i = 0; $i < $count; $i++) {
              $name = $out[$i]["tagName"];

              if (!isset($level[$name])) {
                  $level[$name] = 0;
                  $open[$name] = [];
              }

              if (isset($out[$i]["startTag"])) {
                  $level[$name]++;
                  $out[$i]["level"] = $level[$name];
                  $open[$name][] = $i;
                  continue;
              }

              $level[$name]--;

              // A closing tag pairs with the innermost opening tag still open.
              $start = array_pop($open[$name]);

              if ($start === null) {
                  // stray closing tag
                  continue;
              }

              $content_start = $out[$start]["startPos"] + strlen($out[$start]["startTag"]);

              $out[$start]["endPos"] = $out[$i]["startPos"] + strlen($out[$i]["endTag"]);
              $out[$start]["innerHTML"] =
                  substr(
                      $html,
                      $content_start,
                      $out[$i]["startPos"] - $content_start
                  );
              $out[$start]["outerHTML"] =
                  substr(
                      $html,
                      $out[$start]["startPos"],
                      $out[$start]["endPos"] - $out[$start]["startPos"]
                  );
          }

          // Keep opening tags only, without the internal "startTag" field.
          $elements = [];

          foreach ($out as $tag) {
              if (isset($tag["endTag"])) {
                  continue;
              }

              unset($tag["startTag"]);
              $elements[] = $tag;
          }

          return $elements;
      }

      /**
       * Keep the elements that carry the given attribute, whatever its value.
       *
       * @param string            $name       Attribute name (case-insensitive).
       * @param array|string|null $collection Element list to filter, a tag name to
       *                                      search first, or null for every element.
       *
       * @return array Matching element arrays.
       */
      public function getElementsByAttributeName(string $name, $collection = null)
      {
          if ($collection === null) {
              $collection = $this->getElementsByTagName("*");
          } elseif (is_string($collection)) {
              $collection = $this->getElementsByTagName($collection);
          }

          $return = [];

          foreach ($collection as $elem) {
              if ($this->attributeValue($elem, $name) !== null) {
                  $return[] = $elem;
              }
          }

          return $return;
      }

      /**
       * Keep the elements whose attribute $name contains every whitespace-separated
       * token of $value, in any order.
       *
       * @param string            $name       Attribute name (case-insensitive).
       * @param string            $value      Tokens that must all be present.
       * @param array|string|null $collection See getElementsByAttributeName().
       *
       * @return array Matching element arrays; empty when $value holds no token.
       */
      public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null)
      {
          $wanted = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);

          if (count($wanted) === 0) {
              return [];
          }

          $return = [];

          foreach ($this->getElementsByAttributeName($name, $collection) as $elem) {
              // Only the named attribute is inspected, never its siblings.
              $tokens =
                  preg_split(
                      '/\s+/',
                      (string) $this->attributeValue($elem, $name),
                      -1,
                      PREG_SPLIT_NO_EMPTY
                  );

              if (count(array_diff($wanted, $tokens)) === 0) {
                  $return[] = $elem;
              }
          }

          return $return;
      }

      /**
       * Keep the elements whose attribute $name is exactly $value.
       *
       * @param string            $name       Attribute name (case-insensitive).
       * @param string            $value      Expected attribute value.
       * @param array|string|null $collection See getElementsByAttributeName().
       *
       * @return array Matching element arrays.
       */
      public function getElementsByAttributeValue(string $name, string $value, $collection = null)
      {
          $return = [];

          foreach ($this->getElementsByAttributeName($name, $collection) as $elem) {
              // Only the named attribute is inspected, never its siblings.
              if ((string) $this->attributeValue($elem, $name) === $value) {
                  $return[] = $elem;
              }
          }

          return $return;
      }

      /**
       * @param string            $idname     Value of the "id" attribute to look for.
       * @param array|string|null $collection See getElementsByAttributeName().
       *
       * @return array|false First matching element, or false when there is none.
       */
      public function getElementById(string $idname, $collection = null)
      {
          $id = $this->getElementsByAttributeValue("id", $idname, $collection);

          if (count($id) !== 0) {
              return $id[0];
          }

          return false;
      }

      /**
       * @param string            $classname  One or more class names, whitespace separated;
       *                                      an element must carry all of them.
       * @param array|string|null $collection See getElementsByAttributeName().
       *
       * @return array Matching element arrays.
       */
      public function getElementsByClassName(string $classname, $collection = null)
      {
          return $this->getElementsByFuzzyAttributeValue("class", $classname, $collection);
      }

      /**
       * Reduce markup to text: the input is cut at newlines and <br> tags, each
       * piece has its tags stripped and its entities decoded, and the pieces are
       * joined again.
       *
       * @param string|array $html       Markup or an element array ("innerHTML" is used).
       * @param bool         $whitespace true joins pieces with "\n", otherwise with " ".
       * @param bool         $trim       Trim every piece and the final result.
       *
       * @return string
       *
       * @throws Exception When an array without "innerHTML" is supplied.
       */
      public function getTextContent($html, $whitespace = false, $trim = true)
      {
          if (is_array($html)) {
              if (!isset($html["innerHTML"])) {
                  throw new Exception("(getTextContent) Supplied array doesn't contain an innerHTML index");
              }

              $html = $html["innerHTML"];
          }

          // Also accept the self-closing spellings <br/> and <br />.
          $pieces = preg_split('/\n|<\/?br\s*\/?>/i', $html);
          $separator = $whitespace === true ? "\n" : " ";
          $out = "";

          foreach ($pieces as $piece) {
              $tmp =
                  html_entity_decode(
                      strip_tags($piece),
                      ENT_QUOTES | ENT_XML1,
                      "UTF-8"
                  );

              if ($trim) {
                  $tmp = trim($tmp);
              }

              $out .= $tmp . $separator;
          }

          if ($trim) {
              return trim($out);
          }

          return $out;
      }

      /**
       * Best-effort conversion of a JavaScript object literal into a PHP array.
       *
       * Text outside quoted strings is rewritten character by character: bare
       * tokens get wrapped in double quotes, and a token in front of ":" or "{"
       * that contains "function", "array" or "return" is removed from the
       * output. The rewritten text is then handed to json_decode(), so bare
       * values come back as strings.
       *
       * @param string $json JavaScript object / array literal.
       *
       * @return mixed Decoded data, or null when the rewritten text is not valid JSON.
       */
      public function parseJsObject(string $json)
      {
          $bracket = false;          // quote character of the string being read, or false
          $is_close_bracket = false; // current character is the quote that just closed a string
          $escape = false;           // previous character was an unescaped backslash
          $json_out = "";
          $last_char = null;
          $keyword_check = "";
          $length = strlen($json);

          for ($i = 0; $i < $length; $i++) {
              $char = $json[$i];

              switch ($char) {
                  case "\"":
                  case "'":
                      if ($escape === true) {
                          break;
                      }

                      if ($char === $bracket) {
                          $bracket = false;
                          $is_close_bracket = true;
                      } elseif ($bracket === false) {
                          $bracket = $char;
                      }
                      break;

                  default:
                      $is_close_bracket = false;
                      break;
              }

              // A backslash escapes the next character unless it is itself escaped.
              $escape = ($char === "\\" && $escape === false);

              if (
                  $bracket === false &&
                  $is_close_bracket === false
              ) {
                  // do keyword check
                  $keyword_check .= $char;

                  if (in_array($char, [":", "{"], true)) {
                      $keyword_check = substr($keyword_check, 0, -1);

                      if (
                          preg_match(
                              '/function|array|return/i',
                              $keyword_check
                          )
                      ) {
                          $json_out =
                              preg_replace(
                                  '/[{"]*' . preg_quote($keyword_check, "/") . '$/',
                                  "",
                                  $json_out
                              );
                      }

                      $keyword_check = "";
                  }

                  // here we know we're not iterating over a quoted string
                  switch ($char) {
                      case "[":
                      case "{":
                          $json_out .= $char;
                          break;

                      case "]":
                      case "}":
                      case ",":
                      case ":":
                          // close the quote opened around a bare token
                          if (!in_array($last_char, ["[", "{", "}", "]", "\""], true)) {
                              $json_out .= "\"";
                          }

                          $json_out .= $char;
                          break;

                      default:
                          // open a quote in front of a bare token
                          if (in_array($last_char, ["{", "[", ",", ":"], true)) {
                              $json_out .= "\"";
                          }

                          $json_out .= $char;
                          break;
                  }
              } else {
                  $json_out .= $char;
              }

              $last_char = $char;
          }

          return json_decode($json_out, true);
      }

      /**
       * Resolve the escape sequences \uXXXX and \xXX inside a JavaScript string
       * body and replace the two-character sequences \n and \r with a space.
       *
       * @param string $string Raw string body as it appears in JavaScript source.
       *
       * @return string|null Result of preg_replace_callback().
       */
      public function parseJsString($string)
      {
          return
              preg_replace_callback(
                  '/\\\u[A-Fa-f0-9]{4}|\\\x[A-Fa-f0-9]{2}|\\\n|\\\r/',
                  function ($match) {
                      $sequence = $match[0];

                      // $sequence[1] is the character right after the backslash
                      if ($sequence[1] === "u") {
                          return json_decode('"' . $sequence . '"');
                      }

                      if ($sequence[1] === "x") {
                          return mb_convert_encoding(
                              stripcslashes($sequence),
                              "utf-8",
                              "windows-1252"
                          );
                      }

                      return " ";
                  },
                  $string
              );
      }

      /**
       * Cut the first balanced [...] or {...} span out of arbitrary text.
       *
       * Brackets inside quoted strings are ignored, and closing brackets that
       * appear before the first opening bracket are skipped.
       *
       * @param string $json Text that contains a JSON / JavaScript literal somewhere.
       *
       * @return string|null The balanced span, or null when none is found.
       */
      public function extract_json($json)
      {
          $len = strlen($json);
          $array_level = 0;
          $object_level = 0;
          $in_quote = null;
          $escaped = false; // previous character was an unescaped backslash
          $start = null;

          for ($i = 0; $i < $len; $i++) {
              $char = $json[$i];

              switch ($char) {
                  case "[":
                      if ($in_quote === null) {
                          $array_level++;

                          if ($start === null) {
                              $start = $i;
                          }
                      }
                      break;

                  case "]":
                      if ($in_quote === null && $start !== null) {
                          $array_level--;
                      }
                      break;

                  case "{":
                      if ($in_quote === null) {
                          $object_level++;

                          if ($start === null) {
                              $start = $i;
                          }
                      }
                      break;

                  case "}":
                      if ($in_quote === null && $start !== null) {
                          $object_level--;
                      }
                      break;

                  case "\"":
                  case "'":
                      if ($escaped === false) {
                          // found a non-escaped quote
                          if ($in_quote === null) {
                              // open quote
                              $in_quote = $char;
                          } elseif ($in_quote === $char) {
                              // close quote
                              $in_quote = null;
                          }
                      }
                      break;
              }

              // A backslash escapes the next character unless it is itself escaped.
              $escaped = ($char === "\\" && $escaped === false);

              if (
                  $start !== null &&
                  $array_level === 0 &&
                  $object_level === 0
              ) {
                  return substr($json, $start, $i - $start + 1);
              }
          }

          return null;
      }

      /**
       * Turn the attribute part of an opening tag into a name => value map.
       * Names are lower-cased; attributes without a value map to "true".
       */
      private function parseAttributes(string $raw)
      {
          preg_match_all(
              '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
              $raw,
              $regex_attributes
          );

          $attributes = [];

          foreach (($regex_attributes[1] ?? []) as $k => $attrib_name) {
              $attrib_name = strtolower($attrib_name);
              $attrib_value = $regex_attributes[2][$k];

              if (trim($attrib_value) == "") {
                  $attributes[$attrib_name] = "true";
                  continue;
              }

              $attributes[$attrib_name] = trim($attrib_value, "'\" \n\r\t\v\x00");
          }

          return $attributes;
      }

      /**
       * Look up one attribute of an element by case-insensitive name.
       * Returns its value, or null when the element does not carry it.
       */
      private function attributeValue(array $elem, string $name)
      {
          foreach (($elem["attributes"] ?? []) as $attrib_name => $attrib_value) {
              if (strcasecmp((string) $attrib_name, $name) === 0) {
                  return $attrib_value;
              }
          }

          return null;
      }
  }