123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610 |
- <?php
/**
 * Small regex-based HTML scraper.
 *
 * Elements are plain associative arrays with these keys:
 *   - "tagName"    lower-cased tag name
 *   - "startPos"   byte offset of the opening tag in the loaded markup
 *   - "endPos"     byte offset just past the closing tag (0 when no closing tag was found)
 *   - "attributes" map of lower-cased attribute name => value ("true" for value-less attributes)
 *   - "innerHTML"  markup between the opening and closing tag (null when no closing tag was found)
 *   - "level"      nesting depth among elements of the same tag name (1 = outermost)
 *   - "outerHTML"  opening tag + innerHTML + closing tag (only present when a closing tag was found)
 *
 * The class also bundles helpers for pulling JSON-ish data out of inline JavaScript.
 */
class fuckhtml
{
    /**
     * Markup currently loaded into the parser (null until load() is called).
     *
     * @var string|null
     */
    public $html = null;

    /**
     * Byte length of the loaded markup.
     *
     * @var int
     */
    public $strlen = 0;

    /**
     * @param string|array|null $html   Markup, a file path (when $isfile is true),
     *                                  or an element array carrying an "innerHTML" key.
     * @param bool              $isfile Treat $html as a path and read the file.
     *
     * @throws Exception See load().
     */
    public function __construct($html = null, $isfile = false)
    {
        if ($html !== null) {
            $this->load($html, $isfile);
        }
    }

    /**
     * Load markup into the parser, replacing whatever was loaded before.
     *
     * @param string|array $html   Markup, a file path (when $isfile is true),
     *                             or an element array carrying an "innerHTML" key.
     * @param bool         $isfile Treat $html as a path and read the file.
     *
     * @throws Exception When an array without "innerHTML" is supplied or the file cannot be read.
     */
    public function load($html, $isfile = false)
    {
        if (is_array($html)) {
            if (!isset($html["innerHTML"])) {
                throw new Exception("(load) Supplied array doesn't contain an innerHTML index");
            }

            $html = $html["innerHTML"];
        }

        if ($isfile) {
            // file_get_contents copes with empty files and reports failure through
            // its return value, unlike the fopen/fread(filesize) sequence.
            $contents = file_get_contents($html);

            if ($contents === false) {
                throw new Exception("(load) Could not read file: " . $html);
            }

            $html = $contents;
        }

        $this->html = $html;
        $this->strlen = strlen($this->html);
    }

    /**
     * @return string|null The markup passed to the last load() call.
     */
    public function getloadedhtml()
    {
        return $this->html;
    }

    /**
     * Find every element with the given tag name ("*" matches any tag).
     *
     * Opening and closing tags are located with regular expressions and paired
     * by nesting depth, counted separately for each tag name.
     *
     * @param string $tagname Tag name (case-insensitive) or "*".
     *
     * @return array List of element arrays in document order (see class docblock).
     *
     * @throws Exception When PCRE fails while scanning the markup.
     */
    public function getElementsByTagName(string $tagname)
    {
        $html = (string)$this->html;

        if ($tagname === "*") {
            $tagPattern = '[A-Za-z0-9._-]+';
        } else {
            // Quote with the delimiter so a "/" in the name cannot end the pattern.
            $tagPattern = preg_quote(strtolower($tagname), "/");
        }

        // Opening tags, e.g. <div class="mydiv">. The attribute part uses possessive
        // quantifiers: the alternatives cannot overlap, so the matched text is the
        // same as with plain quantifiers, but an unterminated tag can no longer
        // trigger exponential backtracking.
        $found = preg_match_all(
            '/<\s*(' . $tagPattern . ')(\s(?:[^>\'"]++|"[^"]*"|\'[^\']*\')*+)?\s*>/i',
            $html,
            $opening,
            PREG_OFFSET_CAPTURE
        );

        if ($found === false) {
            throw new Exception(
                "(getElementsByTagName) Failed to scan opening tags, PCRE error code " . preg_last_error()
            );
        }

        $nodes = [];

        foreach ($opening[0] as $index => $whole) {
            $rawAttributes = isset($opening[2][$index][0]) ? $opening[2][$index][0] : "";

            $nodes[] = [
                "tagName" => strtolower($opening[1][$index][0]),
                "startPos" => $whole[1],
                "endPos" => 0,
                "startTag" => $whole[0],
                "attributes" => $this->parseAttributes($rawAttributes),
                "innerHTML" => null
            ];
        }

        // Closing tags, e.g. </div>
        $found = preg_match_all(
            '/<\s*\/\s*(' . $tagPattern . ')\s*>/i',
            $html,
            $closing,
            PREG_OFFSET_CAPTURE
        );

        if ($found === false) {
            throw new Exception(
                "(getElementsByTagName) Failed to scan closing tags, PCRE error code " . preg_last_error()
            );
        }

        foreach ($closing[0] as $index => $whole) {
            $nodes[] = [
                "tagName" => strtolower($closing[1][$index][0]),
                "endTag" => $whole[0],
                "startPos" => $whole[1]
            ];
        }

        // Interleave opening and closing tags in document order. The comparator
        // must return an integer; returning a bool is deprecated in PHP 8.
        usort(
            $nodes,
            static function ($a, $b) {
                return $a["startPos"] <=> $b["startPos"];
            }
        );

        // Compute the nesting depth of every tag, counted per tag name.
        $depth = [];
        $total = count($nodes);

        for ($i = 0; $i < $total; $i++) {
            $tag = $nodes[$i]["tagName"];

            if (!isset($depth[$tag])) {
                $depth[$tag] = 0;
            }

            if (isset($nodes[$i]["startTag"])) {
                // opening tag: one level deeper
                $depth[$tag]++;
                $nodes[$i]["level"] = $depth[$tag];
            } else {
                // closing tag: shares the level of the tag it closes
                $nodes[$i]["level"] = $depth[$tag];
                $depth[$tag]--;
            }
        }

        // The first later closing tag with the same name and level closes the element.
        for ($i = 0; $i < $total; $i++) {
            if (!isset($nodes[$i]["startTag"])) {
                continue;
            }

            for ($k = $i + 1; $k < $total; $k++) {
                if (
                    !isset($nodes[$k]["endTag"]) ||
                    $nodes[$i]["tagName"] !== $nodes[$k]["tagName"] ||
                    $nodes[$i]["level"] !== $nodes[$k]["level"]
                ) {
                    continue;
                }

                $contentStart = $nodes[$i]["startPos"] + strlen($nodes[$i]["startTag"]);
                $closeStart = $nodes[$k]["startPos"];
                $closeEnd = $closeStart + strlen($nodes[$k]["endTag"]);

                $nodes[$i]["endPos"] = $closeEnd;
                $nodes[$i]["innerHTML"] = substr($html, $contentStart, $closeStart - $contentStart);
                $nodes[$i]["outerHTML"] = substr(
                    $html,
                    $nodes[$i]["startPos"],
                    $closeEnd - $nodes[$i]["startPos"]
                );

                break;
            }
        }

        // Keep only the elements; closing-tag markers and the raw start tag are internal.
        $elements = [];

        foreach ($nodes as $node) {
            if (isset($node["endTag"])) {
                continue;
            }

            unset($node["startTag"]);
            $elements[] = $node;
        }

        return $elements;
    }

    /**
     * Parse the attribute part of an opening tag into a name => value map.
     *
     * Names are always lower-cased. Attributes without a value map to the string
     * "true"; other values have surrounding quotes and whitespace removed.
     *
     * @param string $raw Text between the tag name and the closing ">".
     *
     * @return array
     */
    private function parseAttributes(string $raw): array
    {
        $attributes = [];

        preg_match_all(
            '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
            $raw,
            $found
        );

        foreach ($found[1] as $index => $name) {
            $name = strtolower($name);
            $value = $found[2][$index];

            if (trim($value) === "") {
                $attributes[$name] = "true";
                continue;
            }

            $attributes[$name] = trim($value, "'\" \n\r\t\v\x00");
        }

        return $attributes;
    }

    /**
     * Keep the elements that carry the given attribute, whatever its value.
     *
     * @param string            $name       Attribute name (case-insensitive).
     * @param array|string|null $collection Element list to search, a tag name to
     *                                      search within, or null for every element.
     *
     * @return array
     */
    public function getElementsByAttributeName(string $name, $collection = null)
    {
        if ($collection === null) {
            $collection = $this->getElementsByTagName("*");
        } elseif (is_string($collection)) {
            $collection = $this->getElementsByTagName($collection);
        }

        // Attribute names are stored lower-cased.
        $name = strtolower($name);
        $matches = [];

        foreach ($collection as $elem) {
            if (array_key_exists($name, $elem["attributes"])) {
                $matches[] = $elem;
            }
        }

        return $matches;
    }

    /**
     * Keep the elements whose attribute $name contains every space-separated
     * token of $value, in any order (the way class names are matched).
     *
     * @param string            $name       Attribute name (case-insensitive).
     * @param string            $value      One or more space-separated tokens.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array
     */
    public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null)
    {
        $key = strtolower($name);
        $wanted = explode(
            " ",
            trim(
                preg_replace(
                    '/ +/',
                    " ",
                    $value
                )
            )
        );

        $matches = [];

        foreach ($this->getElementsByAttributeName($name, $collection) as $elem) {
            // Only the requested attribute is inspected, and every wanted token
            // merely has to be present; duplicates in the attribute do not matter.
            $present = preg_split('/\s+/', (string)$elem["attributes"][$key]);

            if (array_diff($wanted, $present) === []) {
                $matches[] = $elem;
            }
        }

        return $matches;
    }

    /**
     * Keep the elements whose attribute $name equals $value exactly.
     *
     * @param string            $name       Attribute name (case-insensitive).
     * @param string            $value      Expected attribute value.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array
     */
    public function getElementsByAttributeValue(string $name, string $value, $collection = null)
    {
        $key = strtolower($name);
        $matches = [];

        foreach ($this->getElementsByAttributeName($name, $collection) as $elem) {
            // Compare the requested attribute only, not every attribute of the element.
            if ((string)$elem["attributes"][$key] === $value) {
                $matches[] = $elem;
            }
        }

        return $matches;
    }

    /**
     * Return the first element whose id attribute equals $idname.
     *
     * @param string            $idname     Value of the id attribute.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array|false The element, or false when none matches.
     */
    public function getElementById(string $idname, $collection = null)
    {
        $found = $this->getElementsByAttributeValue("id", $idname, $collection);

        if (count($found) !== 0) {
            return $found[0];
        }

        return false;
    }

    /**
     * Keep the elements carrying every class listed in $classname.
     *
     * @param string            $classname  One or more space-separated class names.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array
     */
    public function getElementsByClassName(string $classname, $collection = null)
    {
        return $this->getElementsByFuzzyAttributeValue("class", $classname, $collection);
    }

    /**
     * Convert markup to plain text.
     *
     * The markup is split on newlines and <br> tags; each piece has its tags
     * stripped and its XML entities decoded, then the pieces are joined again.
     *
     * @param string|array $html       Markup or an element array carrying "innerHTML".
     * @param bool         $whitespace Join pieces with "\n" when true, with " " otherwise.
     * @param bool         $trim       Trim every piece and the final result.
     *
     * @return string
     *
     * @throws Exception When an array without "innerHTML" is supplied.
     */
    public function getTextContent($html, $whitespace = false, $trim = true)
    {
        if (is_array($html)) {
            if (!isset($html["innerHTML"])) {
                throw new Exception("(getTextContent) Supplied array doesn't contain an innerHTML index");
            }

            $html = $html["innerHTML"];
        }

        $separator = $whitespace === true ? "\n" : " ";

        // Also split on <br/>, <br /> and <br attr="...">; otherwise strip_tags
        // would remove them and fuse the text on both sides.
        $pieces = preg_split('/\n|<\/?br\b[^>]*>/i', (string)$html);

        $out = "";

        foreach ($pieces as $piece) {
            $text = html_entity_decode(
                strip_tags($piece),
                ENT_QUOTES | ENT_XML1,
                "UTF-8"
            );

            if ($trim) {
                $text = trim($text);
            }

            $out .= $text . $separator;
        }

        if ($trim) {
            return trim($out);
        }

        return $out;
    }

    /**
     * Best-effort conversion of a JavaScript object literal to a PHP array.
     *
     * Unquoted keys and bare values are wrapped in double quotes so the result
     * can be handed to json_decode(); bare values therefore come back as strings.
     * Segments ending in ":" or "{" that contain "function", "array" or "return"
     * are removed from the output.
     *
     * @param string $json JavaScript object/array literal.
     *
     * @return mixed Result of json_decode(..., true); null when the rewritten text is not valid JSON.
     */
    public function parseJsObject(string $json)
    {
        $quote = false;       // quote character of the string we are inside, or false
        $justClosed = false;  // true while standing on the quote that closed a string
        $escaped = false;     // true when the previous character was an unescaped backslash
        $out = "";
        $previous = null;
        $keyword = "";
        $length = strlen($json);

        for ($i = 0; $i < $length; $i++) {
            $char = $json[$i];

            if ($char === "\"" || $char === "'") {
                if ($escaped === false) {
                    if ($char === $quote) {
                        $quote = false;
                        $justClosed = true;
                    } elseif ($quote === false) {
                        $quote = $char;
                    }
                }
            } else {
                $justClosed = false;
            }

            // A backslash escapes the next character unless it is itself escaped,
            // so in `\\"` the quote still terminates the string.
            $escaped = ($char === "\\" && $escaped === false);

            if ($quote !== false || $justClosed === true) {
                // inside a quoted string (or on its closing quote): copy verbatim
                $out .= $char;
                $previous = $char;
                continue;
            }

            // Outside of quoted strings: keyword check first.
            $keyword .= $char;

            if ($char === ":" || $char === "{") {
                $keyword = substr($keyword, 0, -1);

                if (preg_match('/function|array|return/i', $keyword)) {
                    $out = preg_replace(
                        '/[{"]*' . preg_quote($keyword, "/") . '$/',
                        "",
                        $out
                    );
                }

                $keyword = "";
            }

            switch ($char) {
                case "[":
                case "{":
                    $out .= $char;
                    break;

                case "]":
                case "}":
                case ",":
                case ":":
                    // close the quote opened around a bare token
                    if (!in_array($previous, ["[", "{", "}", "]", "\""], true)) {
                        $out .= "\"";
                    }

                    $out .= $char;
                    break;

                default:
                    // open a quote in front of a bare token
                    if (in_array($previous, ["{", "[", ",", ":"], true)) {
                        $out .= "\"";
                    }

                    $out .= $char;
                    break;
            }

            $previous = $char;
        }

        return json_decode($out, true);
    }

    /**
     * Decode the escape sequences of a JavaScript string literal body.
     *
     * \uXXXX (including surrogate pairs) and \xXX are replaced by the character
     * they denote (\xXX bytes are interpreted as windows-1252); the two-character
     * sequences \n and \r become a single space.
     *
     * @param string $string Raw contents of a JavaScript string literal.
     *
     * @return string|null
     */
    public function parseJsString($string)
    {
        return preg_replace_callback(
            // The surrogate pair alternative comes first so both halves are decoded
            // together; a lone half cannot be represented in UTF-8.
            '/\\\\u[dD][89abAB][A-Fa-f0-9]{2}\\\\u[dD][c-fC-F][A-Fa-f0-9]{2}'
            . '|\\\\u[A-Fa-f0-9]{4}|\\\\x[A-Fa-f0-9]{2}|\\\\[nr]/',
            static function ($match) {
                switch ($match[0][1]) {
                    case "u":
                        return (string)json_decode('"' . $match[0] . '"');

                    case "x":
                        return mb_convert_encoding(
                            stripcslashes($match[0]),
                            "utf-8",
                            "windows-1252"
                        );

                    default:
                        return " ";
                }
            },
            $string
        );
    }

    /**
     * Return the first balanced {...} or [...] span found in $json.
     *
     * Brackets inside quoted strings are ignored, and a quote preceded by an
     * unescaped backslash neither opens nor closes a string.
     *
     * @param string $json Text that contains a JSON/JS object or array somewhere.
     *
     * @return string|null The balanced span, or null when none is found.
     */
    public function extract_json($json)
    {
        $json = (string)$json;
        $len = strlen($json);
        $array_level = 0;
        $object_level = 0;
        $in_quote = null;
        $escaped = false;
        $start = null;

        for ($i = 0; $i < $len; $i++) {
            $char = $json[$i];

            // Track escaping as a toggle so `\\"` is an escaped backslash followed
            // by a real quote, and a quote at offset 0 is handled like any other.
            $char_is_escaped = $escaped;
            $escaped = ($char === "\\" && $escaped === false);

            if ($char === "\"" || $char === "'") {
                if ($char_is_escaped === false) {
                    if ($in_quote === null) {
                        // open quote
                        $in_quote = $char;
                    } elseif ($in_quote === $char) {
                        // close quote
                        $in_quote = null;
                    }
                }
            } elseif ($in_quote === null) {
                switch ($char) {
                    case "[":
                        $array_level++;
                        if ($start === null) {
                            $start = $i;
                        }
                        break;

                    case "{":
                        $object_level++;
                        if ($start === null) {
                            $start = $i;
                        }
                        break;

                    // Closers seen before the first opener are stray text; counting
                    // them would make the very next opener look balanced already.
                    case "]":
                        if ($start !== null) {
                            $array_level--;
                        }
                        break;

                    case "}":
                        if ($start !== null) {
                            $object_level--;
                        }
                        break;
                }
            }

            if (
                $start !== null &&
                $array_level === 0 &&
                $object_level === 0
            ) {
                return substr($json, $start, $i - $start + 1);
            }
        }

        return null;
    }
}
|