google.php 90 KB

  1. <?php
  2. // @TODO check for page, if need be
  3. class google{
  /**
   * Load the helper libraries this scraper depends on and instantiate them.
   *
   * Sets two instance properties used by the rest of the class:
   *  - $this->fuckhtml : a `fuckhtml` instance (HTML parsing helper)
   *  - $this->backend  : a `backend` instance created for "google"
   */
  public function __construct(){
      // include_once instead of include: a plain include re-executes the
      // file on every construction, so creating a second instance (or any
      // other code having already included the same library) would abort
      // with "Cannot declare class ...". Path resolution and the behaviour
      // on a missing file are the same as with include.
      include_once "lib/fuckhtml.php";
      $this->fuckhtml = new fuckhtml();

      include_once "lib/backend.php";
      $this->backend = new backend("google");
  }
  /**
   * Describe the search filters available for a given result page.
   *
   * Every known page gets the shared "country" and "nsfw" filters followed
   * by the page-specific ones. Each filter is an array with a "display"
   * label and an "option" entry; "option" is either a map of
   * value => label, or the marker string "_DATE".
   *
   * @param string $page One of "web", "images", "videos" or "news".
   * @return array|null  Filter definitions keyed by filter name, or null
   *                     when $page is not one of the known pages.
   */
  public function getfilters($page){

      // filters shared by every page
      $base = [
          "country" => [ // gl=<country> (image: cr=countryAF)
              "display" => "Country",
              "option" => [
                  "any" => "Instance's country",
                  "af" => "Afghanistan",
                  "al" => "Albania",
                  "dz" => "Algeria",
                  "as" => "American Samoa",
                  "ad" => "Andorra",
                  "ao" => "Angola",
                  "ai" => "Anguilla",
                  "aq" => "Antarctica",
                  "ag" => "Antigua and Barbuda",
                  "ar" => "Argentina",
                  "am" => "Armenia",
                  "aw" => "Aruba",
                  "au" => "Australia",
                  "at" => "Austria",
                  "az" => "Azerbaijan",
                  "bs" => "Bahamas",
                  "bh" => "Bahrain",
                  "bd" => "Bangladesh",
                  "bb" => "Barbados",
                  "by" => "Belarus",
                  "be" => "Belgium",
                  "bz" => "Belize",
                  "bj" => "Benin",
                  "bm" => "Bermuda",
                  "bt" => "Bhutan",
                  "bo" => "Bolivia",
                  "ba" => "Bosnia and Herzegovina",
                  "bw" => "Botswana",
                  "bv" => "Bouvet Island",
                  "br" => "Brazil",
                  "io" => "British Indian Ocean Territory",
                  "bn" => "Brunei Darussalam",
                  "bg" => "Bulgaria",
                  "bf" => "Burkina Faso",
                  "bi" => "Burundi",
                  "kh" => "Cambodia",
                  "cm" => "Cameroon",
                  "ca" => "Canada",
                  "cv" => "Cape Verde",
                  "ky" => "Cayman Islands",
                  "cf" => "Central African Republic",
                  "td" => "Chad",
                  "cl" => "Chile",
                  "cn" => "China",
                  "cx" => "Christmas Island",
                  "cc" => "Cocos (Keeling) Islands",
                  "co" => "Colombia",
                  "km" => "Comoros",
                  "cg" => "Congo",
                  "cd" => "Congo, the Democratic Republic",
                  "ck" => "Cook Islands",
                  "cr" => "Costa Rica",
                  "ci" => "Cote D'ivoire",
                  "hr" => "Croatia",
                  "cu" => "Cuba",
                  "cy" => "Cyprus",
                  "cz" => "Czech Republic",
                  "dk" => "Denmark",
                  "dj" => "Djibouti",
                  "dm" => "Dominica",
                  "do" => "Dominican Republic",
                  "ec" => "Ecuador",
                  "eg" => "Egypt",
                  "sv" => "El Salvador",
                  "gq" => "Equatorial Guinea",
                  "er" => "Eritrea",
                  "ee" => "Estonia",
                  "et" => "Ethiopia",
                  "fk" => "Falkland Islands (Malvinas)",
                  "fo" => "Faroe Islands",
                  "fj" => "Fiji",
                  "fi" => "Finland",
                  "fr" => "France",
                  "gf" => "French Guiana",
                  "pf" => "French Polynesia",
                  "tf" => "French Southern Territories",
                  "ga" => "Gabon",
                  "gm" => "Gambia",
                  "ge" => "Georgia",
                  "de" => "Germany",
                  "gh" => "Ghana",
                  "gi" => "Gibraltar",
                  "gr" => "Greece",
                  "gl" => "Greenland",
                  "gd" => "Grenada",
                  "gp" => "Guadeloupe",
                  "gu" => "Guam",
                  "gt" => "Guatemala",
                  "gn" => "Guinea",
                  "gw" => "Guinea-Bissau",
                  "gy" => "Guyana",
                  "ht" => "Haiti",
                  "hm" => "Heard Island and Mcdonald Islands",
                  "va" => "Holy See (Vatican City State)",
                  "hn" => "Honduras",
                  "hk" => "Hong Kong",
                  "hu" => "Hungary",
                  "is" => "Iceland",
                  "in" => "India",
                  "id" => "Indonesia",
                  "ir" => "Iran, Islamic Republic",
                  "iq" => "Iraq",
                  "ie" => "Ireland",
                  "il" => "Israel",
                  "it" => "Italy",
                  "jm" => "Jamaica",
                  "jp" => "Japan",
                  "jo" => "Jordan",
                  "kz" => "Kazakhstan",
                  "ke" => "Kenya",
                  "ki" => "Kiribati",
                  "kp" => "Korea, Democratic People's Republic",
                  "kr" => "Korea, Republic",
                  "kw" => "Kuwait",
                  "kg" => "Kyrgyzstan",
                  "la" => "Lao People's Democratic Republic",
                  "lv" => "Latvia",
                  "lb" => "Lebanon",
                  "ls" => "Lesotho",
                  "lr" => "Liberia",
                  "ly" => "Libyan Arab Jamahiriya",
                  "li" => "Liechtenstein",
                  "lt" => "Lithuania",
                  "lu" => "Luxembourg",
                  "mo" => "Macao",
                  // label typo fixed (was "Yugosalv")
                  "mk" => "Macedonia, the Former Yugoslav Republic",
                  "mg" => "Madagascar",
                  "mw" => "Malawi",
                  "my" => "Malaysia",
                  "mv" => "Maldives",
                  "ml" => "Mali",
                  "mt" => "Malta",
                  "mh" => "Marshall Islands",
                  "mq" => "Martinique",
                  "mr" => "Mauritania",
                  "mu" => "Mauritius",
                  "yt" => "Mayotte",
                  "mx" => "Mexico",
                  "fm" => "Micronesia, Federated States",
                  "md" => "Moldova, Republic",
                  "mc" => "Monaco",
                  "mn" => "Mongolia",
                  "ms" => "Montserrat",
                  "ma" => "Morocco",
                  "mz" => "Mozambique",
                  "mm" => "Myanmar",
                  "na" => "Namibia",
                  "nr" => "Nauru",
                  "np" => "Nepal",
                  "nl" => "Netherlands",
                  "an" => "Netherlands Antilles",
                  "nc" => "New Caledonia",
                  "nz" => "New Zealand",
                  "ni" => "Nicaragua",
                  "ne" => "Niger",
                  "ng" => "Nigeria",
                  "nu" => "Niue",
                  "nf" => "Norfolk Island",
                  "mp" => "Northern Mariana Islands",
                  "no" => "Norway",
                  "om" => "Oman",
                  "pk" => "Pakistan",
                  "pw" => "Palau",
                  "ps" => "Palestinian Territory, Occupied",
                  "pa" => "Panama",
                  "pg" => "Papua New Guinea",
                  "py" => "Paraguay",
                  "pe" => "Peru",
                  "ph" => "Philippines",
                  "pn" => "Pitcairn",
                  "pl" => "Poland",
                  "pt" => "Portugal",
                  "pr" => "Puerto Rico",
                  "qa" => "Qatar",
                  "re" => "Reunion",
                  "ro" => "Romania",
                  "ru" => "Russian Federation",
                  "rw" => "Rwanda",
                  "sh" => "Saint Helena",
                  "kn" => "Saint Kitts and Nevis",
                  "lc" => "Saint Lucia",
                  "pm" => "Saint Pierre and Miquelon",
                  "vc" => "Saint Vincent and the Grenadines",
                  "ws" => "Samoa",
                  "sm" => "San Marino",
                  "st" => "Sao Tome and Principe",
                  "sa" => "Saudi Arabia",
                  "sn" => "Senegal",
                  "cs" => "Serbia and Montenegro",
                  "sc" => "Seychelles",
                  "sl" => "Sierra Leone",
                  "sg" => "Singapore",
                  "sk" => "Slovakia",
                  "si" => "Slovenia",
                  "sb" => "Solomon Islands",
                  "so" => "Somalia",
                  "za" => "South Africa",
                  "gs" => "South Georgia and the South Sandwich Islands",
                  "es" => "Spain",
                  "lk" => "Sri Lanka",
                  "sd" => "Sudan",
                  "sr" => "Suriname",
                  "sj" => "Svalbard and Jan Mayen",
                  "sz" => "Swaziland",
                  "se" => "Sweden",
                  "ch" => "Switzerland",
                  "sy" => "Syrian Arab Republic",
                  "tw" => "Taiwan, Province of China",
                  "tj" => "Tajikistan",
                  "tz" => "Tanzania, United Republic",
                  "th" => "Thailand",
                  "tl" => "Timor-Leste",
                  "tg" => "Togo",
                  "tk" => "Tokelau",
                  "to" => "Tonga",
                  "tt" => "Trinidad and Tobago",
                  "tn" => "Tunisia",
                  "tr" => "Turkey",
                  "tm" => "Turkmenistan",
                  "tc" => "Turks and Caicos Islands",
                  "tv" => "Tuvalu",
                  "ug" => "Uganda",
                  "ua" => "Ukraine",
                  "ae" => "United Arab Emirates",
                  "uk" => "United Kingdom",
                  "us" => "United States",
                  "um" => "United States Minor Outlying Islands",
                  "uy" => "Uruguay",
                  "uz" => "Uzbekistan",
                  "vu" => "Vanuatu",
                  "ve" => "Venezuela",
                  "vn" => "Viet Nam",
                  "vg" => "Virgin Islands, British",
                  "vi" => "Virgin Islands, U.S.",
                  "wf" => "Wallis and Futuna",
                  "eh" => "Western Sahara",
                  "ye" => "Yemen",
                  "zm" => "Zambia",
                  "zw" => "Zimbabwe"
              ]
          ],
          "nsfw" => [
              "display" => "NSFW",
              "option" => [
                  "yes" => "Yes", // safe=active
                  "no" => "No" // safe=off
              ]
          ]
      ];

      // "Newer than" / "Older than" pair used by the web, videos and news
      // pages; defined once so the three pages cannot drift apart.
      // NOTE(review): "_DATE" is presumably a marker the frontend turns
      // into a date input -- confirm against the caller.
      $date_range = [
          "newer" => [ // tbs
              "display" => "Newer than",
              "option" => "_DATE"
          ],
          "older" => [
              "display" => "Older than",
              "option" => "_DATE"
          ]
      ];

      switch($page){

          case "web":
              return array_merge(
                  $base,
                  [
                      "lang" => [ // lr=<lang> (prefix lang with "lang_")
                          "display" => "Language",
                          "option" => [
                              "any" => "Any language",
                              "ar" => "Arabic",
                              "bg" => "Bulgarian",
                              "ca" => "Catalan",
                              "cs" => "Czech",
                              "da" => "Danish",
                              "de" => "German",
                              "el" => "Greek",
                              "en" => "English",
                              "es" => "Spanish",
                              "et" => "Estonian",
                              "fi" => "Finnish",
                              "fr" => "French",
                              "hr" => "Croatian",
                              "hu" => "Hungarian",
                              "id" => "Indonesian",
                              "is" => "Icelandic",
                              "it" => "Italian",
                              "iw" => "Hebrew",
                              "ja" => "Japanese",
                              "ko" => "Korean",
                              "lt" => "Lithuanian",
                              "lv" => "Latvian",
                              "nl" => "Dutch",
                              "no" => "Norwegian",
                              "pl" => "Polish",
                              "pt" => "Portuguese",
                              "ro" => "Romanian",
                              "ru" => "Russian",
                              "sk" => "Slovak",
                              "sl" => "Slovenian",
                              "sr" => "Serbian",
                              "sv" => "Swedish",
                              "tr" => "Turkish",
                              "zh-CN" => "Chinese (Simplified)",
                              "zh-TW" => "Chinese (Traditional)"
                          ]
                      ]
                  ],
                  $date_range,
                  [
                      "spellcheck" => [
                          "display" => "Spellcheck",
                          "option" => [
                              "yes" => "Yes",
                              "no" => "No"
                          ]
                      ]
                  ]
              );

          case "images":
              return array_merge(
                  $base,
                  [
                      "time" => [ // tbs=qdr:<time>
                          "display" => "Time posted",
                          "option" => [
                              "any" => "Any time",
                              "d" => "Past 24 hours",
                              "w" => "Past week",
                              "m" => "Past month",
                              "y" => "Past year"
                          ]
                      ],
                      "size" => [ // imgsz
                          "display" => "Size",
                          "option" => [
                              "any" => "Any size",
                              "l" => "Large",
                              "m" => "Medium",
                              "i" => "Icon",
                              "qsvga" => "Larger than 400x300",
                              "vga" => "Larger than 640x480",
                              "svga" => "Larger than 800x600",
                              "xga" => "Larger than 1024x768",
                              "2mp" => "Larger than 2MP",
                              "4mp" => "Larger than 4MP",
                              "6mp" => "Larger than 6MP",
                              "8mp" => "Larger than 8MP",
                              "10mp" => "Larger than 10MP",
                              "12mp" => "Larger than 12MP",
                              "15mp" => "Larger than 15MP",
                              "20mp" => "Larger than 20MP",
                              "40mp" => "Larger than 40MP",
                              "70mp" => "Larger than 70MP"
                          ]
                      ],
                      "ratio" => [ // imgar
                          "display" => "Aspect ratio",
                          "option" => [
                              "any" => "Any ratio",
                              "t|xt" => "Tall",
                              "s" => "Square",
                              "w" => "Wide",
                              "xw" => "Panoramic"
                          ]
                      ],
                      "color" => [ // imgc
                          "display" => "Color",
                          "option" => [
                              "any" => "Any color",
                              "color" => "Full color",
                              "bnw" => "Black & white",
                              "trans" => "Transparent",
                              // from here, imgcolor
                              "red" => "Red",
                              "orange" => "Orange",
                              "yellow" => "Yellow",
                              "green" => "Green",
                              "teal" => "Teal",
                              "blue" => "Blue",
                              "purple" => "Purple",
                              "pink" => "Pink",
                              "white" => "White",
                              "gray" => "Gray",
                              "black" => "Black",
                              "brown" => "Brown"
                          ]
                      ],
                      "type" => [ // tbs=itp:<type>
                          "display" => "Type",
                          "option" => [
                              "any" => "Any type",
                              "clipart" => "Clip Art",
                              "lineart" => "Line Drawing",
                              "animated" => "Animated"
                          ]
                      ],
                      "format" => [ // as_filetype
                          "display" => "Format",
                          "option" => [
                              "any" => "Any format",
                              "jpg" => "JPG",
                              "gif" => "GIF",
                              "png" => "PNG",
                              "bmp" => "BMP",
                              "svg" => "SVG",
                              "webp" => "WEBP",
                              "ico" => "ICO",
                              "craw" => "RAW"
                          ]
                      ],
                      "rights" => [ // tbs=sur:<rights>
                          "display" => "Usage rights",
                          "option" => [
                              "any" => "Any license",
                              "cl" => "Creative Commons licenses",
                              "ol" => "Commercial & other licenses"
                          ]
                      ]
                  ]
              );

          case "videos":
              return array_merge(
                  $base,
                  $date_range,
                  [
                      "duration" => [
                          "display" => "Duration",
                          "option" => [
                              "any" => "Any duration",
                              "s" => "Short (0-4min)", // tbs=dur:s
                              "m" => "Medium (4-20min)", // tbs=dur:m
                              "l" => "Long (20+ min)" // tbs=dur:l
                          ]
                      ],
                      "quality" => [
                          "display" => "Quality",
                          "option" => [
                              "any" => "Any quality",
                              "h" => "High quality" // tbs=hq:h
                          ]
                      ],
                      "captions" => [
                          "display" => "Captions",
                          "option" => [
                              "any" => "No preference",
                              "yes" => "Closed captioned" // tbs=cc:1
                          ]
                      ]
                  ]
              );

          case "news":
              return array_merge(
                  $base,
                  $date_range,
                  [
                      "sort" => [
                          "display" => "Sort",
                          "option" => [
                              "relevance" => "Relevance",
                              "date" => "Date" // sbd:1
                          ]
                      ]
                  ]
              );

          default:
              // unknown page: no filter set is defined for it. This was
              // previously an implicit fall-through returning null.
              return null;
      }
  }
  /**
   * Fetch a URL with cURL and return the response body.
   *
   * @param mixed  $proxy Proxy descriptor, handed unchanged to
   *                      $this->backend->assign_proxy().
   * @param string $url   Target URL without a query string.
   * @param array  $get   Query parameters; when not empty they are encoded
   *                      with http_build_query() and appended after "?".
   * @return string|bool  Whatever curl_exec() returned for the request.
   * @throws Exception    With the cURL error message when curl_errno()
   *                      reports a failure.
   */
  private function get($proxy, $url, $get = []){

      // browser-like request headers
      $request_headers = [
          "User-Agent: " . config::USER_AGENT,
          "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
          "Accept-Language: en-US,en;q=0.5",
          "Accept-Encoding: gzip",
          "DNT: 1",
          //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
          "Connection: keep-alive",
          "Upgrade-Insecure-Requests: 1",
          "Sec-Fetch-Dest: document",
          "Sec-Fetch-Mode: navigate",
          "Sec-Fetch-Site: none",
          "Sec-Fetch-User: ?1",
          "Priority: u=1",
          "TE: trailers"
      ];

      $handle = curl_init();

      // append the query string, if any parameters were given
      if($get !== []){

          $url = $url . "?" . http_build_query($get);
      }

      // transfer options, applied one by one in the order listed
      $options = [
          CURLOPT_URL => $url,
          CURLOPT_ENCODING => "", // default encoding
          CURLOPT_HTTPHEADER => $request_headers,
          CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0, // use http2
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_SSL_VERIFYHOST => 2,
          CURLOPT_SSL_VERIFYPEER => true,
          CURLOPT_CONNECTTIMEOUT => 30,
          CURLOPT_TIMEOUT => 30,
          CURLOPT_FOLLOWLOCATION => true // follow redirects
      ];

      foreach($options as $option => $value){

          curl_setopt($handle, $option, $value);
      }

      // let the backend configure the proxy on the handle
      $this->backend->assign_proxy($handle, $proxy);

      $response = curl_exec($handle);

      if(curl_errno($handle)){

          throw new Exception(curl_error($handle));
      }

      curl_close($handle);

      return $response;
  }
  536. private function parsepage($html, $pagetype, $search, $proxy, $params){
  537. $out = [
  538. "status" => "ok",
  539. "spelling" => [
  540. "type" => "no_correction",
  541. "using" => null,
  542. "correction" => null
  543. ],
  544. "npt" => null,
  545. "answer" => [],
  546. "web" => [],
  547. "image" => [],
  548. "video" => [],
  549. "news" => [],
  550. "related" => []
  551. ];
  552. $this->fuckhtml->load($html);
  553. $this->detect_sorry();
  554. // parse all <style> tags
  555. $this->parsestyles();
  556. // get javascript images
  557. $this->scrape_dimg($html);
  558. // get html blobs
  559. preg_match_all(
  560. '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/',
  561. $html,
  562. $blobs
  563. );
  564. $this->blobs = [];
  565. if(isset($blobs[1])){
  566. for($i=0; $i<count($blobs[1]); $i++){
  567. $this->blobs[$blobs[1][$i]] =
  568. $this->fuckhtml
  569. ->parseJsString(
  570. $blobs[2][$i]
  571. );
  572. }
  573. }
  574. $this->scrape_imagearr($html);
  575. //
  576. // load result column
  577. //
  578. $result_div =
  579. $this->fuckhtml
  580. ->getElementById(
  581. "center_col",
  582. "div"
  583. );
  584. if($result_div === false){
  585. throw new Exception("Failed to grep result div");
  586. }
  587. $this->fuckhtml->load($result_div);
  588. //
  589. // Get word corrections
  590. //
  591. $correction =
  592. $this->fuckhtml
  593. ->getElementById(
  594. "fprs",
  595. "p"
  596. );
  597. if($correction){
  598. $this->fuckhtml->load($correction);
  599. $a =
  600. $this->fuckhtml
  601. ->getElementsByTagName(
  602. "a"
  603. );
  604. $using =
  605. $this->fuckhtml
  606. ->getElementById(
  607. "fprsl",
  608. $a
  609. );
  610. if($using){
  611. $using =
  612. $this->fuckhtml
  613. ->getTextContent(
  614. $using
  615. );
  616. $spans =
  617. $this->fuckhtml
  618. ->getElementsByTagName(
  619. "span"
  620. );
  621. $type_span =
  622. $this->fuckhtml
  623. ->getTextContent(
  624. $spans[0]
  625. );
  626. $type = "not_many";
  627. if(
  628. stripos(
  629. $type_span,
  630. "Showing results for"
  631. ) !== false
  632. ){
  633. $type = "including";
  634. }
  635. $correction =
  636. $this->fuckhtml
  637. ->getTextContent(
  638. $a[count($a) - 1]
  639. );
  640. $out["spelling"] = [
  641. "type" => $type,
  642. "using" => $using,
  643. "correction" => $correction
  644. ];
  645. }
  646. // reset
  647. $this->fuckhtml->load($result_div);
  648. }else{
  649. // get the "Did you mean?" prompt
  650. $taw =
  651. $this->fuckhtml
  652. ->getElementById(
  653. "taw"
  654. );
  655. if($taw){
  656. $this->fuckhtml->load($taw);
  657. $as =
  658. $this->fuckhtml
  659. ->getElementsByTagName(
  660. "a"
  661. );
  662. if(count($as) !== 0){
  663. $text =
  664. $this->fuckhtml
  665. ->getTextContent(
  666. $as[0]
  667. );
  668. // @TODO implement did_you_mean
  669. $out["spelling"] = [
  670. "type" => "including",
  671. "using" => $search,
  672. "correction" => $text
  673. ];
  674. }
  675. }
  676. $this->fuckhtml->load($result_div);
  677. }
  678. //
  679. // get notices
  680. //
  681. $botstuff =
  682. $this->fuckhtml
  683. ->getElementById(
  684. "botstuff"
  685. );
  686. // important for later
  687. $last_page = false;
  688. if($botstuff){
  689. $this->fuckhtml->load($botstuff);
  690. $cards =
  691. $this->fuckhtml
  692. ->getElementsByClassName(
  693. $this->getstyle(
  694. [
  695. "line-height" => "normal"
  696. ]
  697. ),
  698. "div"
  699. );
  700. foreach($cards as $card){
  701. $this->fuckhtml->load($card);
  702. $h2 =
  703. $this->fuckhtml
  704. ->getElementsByTagName(
  705. "h2"
  706. );
  707. if(count($h2) !== 0){
  708. $title =
  709. $this->fuckhtml
  710. ->getTextContent(
  711. $h2[0]
  712. );
  713. $card["innerHTML"] =
  714. str_replace(
  715. $h2[0]["outerHTML"],
  716. "",
  717. $card["innerHTML"]
  718. );
  719. }else{
  720. $title = "Notice";
  721. }
  722. $div =
  723. $this->fuckhtml
  724. ->getElementsByTagName(
  725. "div"
  726. );
  727. // probe for related searches div, if found, ignore it cause its shit
  728. $probe =
  729. $this->fuckhtml
  730. ->getElementsByAttributeValue(
  731. "role",
  732. "list",
  733. $div
  734. );
  735. // also probe for children
  736. if(count($probe) === 0){
  737. $probe =
  738. $this->fuckhtml
  739. ->getElementsByClassName(
  740. $this->getstyle(
  741. [
  742. "flex-shrink" => "0",
  743. "-moz-box-flex" => "0",
  744. "flex-grow" => "0",
  745. "overflow" => "hidden"
  746. ]
  747. ),
  748. $div
  749. );
  750. }
  751. if(count($probe) === 0){
  752. $description = [];
  753. $as =
  754. $this->fuckhtml
  755. ->getElementsByTagName(
  756. "a"
  757. );
  758. if(count($as) !== 0){
  759. $first = true;
  760. foreach($as as $a){
  761. $text_link =
  762. $this->fuckhtml
  763. ->getTextContent(
  764. $a
  765. );
  766. if(stripos($text_link, "repeat the search") !== false){
  767. $last_page = true;
  768. break 2;
  769. }
  770. $parts =
  771. explode(
  772. $a["outerHTML"],
  773. $card["innerHTML"],
  774. 2
  775. );
  776. $card["innerHTML"] = $parts[1];
  777. $value =
  778. preg_replace(
  779. '/ +/',
  780. " ",
  781. $this->fuckhtml
  782. ->getTextContent(
  783. $parts[0],
  784. false,
  785. false
  786. )
  787. );
  788. if(strlen(trim($value)) !== 0){
  789. $description[] = [
  790. "type" => "text",
  791. "value" => $value
  792. ];
  793. if($first){
  794. $description[0]["value"] =
  795. ltrim($description[0]["value"]);
  796. }
  797. }
  798. $first = false;
  799. $description[] = [
  800. "type" => "link",
  801. "url" =>
  802. $this->fuckhtml
  803. ->getTextContent(
  804. $a["attributes"]
  805. ["href"]
  806. ),
  807. "value" => $text_link
  808. ];
  809. }
  810. $text =
  811. $this->fuckhtml
  812. ->getTextContent(
  813. $card["innerHTML"],
  814. false,
  815. false
  816. );
  817. if(strlen(trim($text)) !== 0){
  818. $description[] = [
  819. "type" => "text",
  820. "value" =>
  821. rtrim(
  822. $text
  823. )
  824. ];
  825. }
  826. }
  827. if(count($description) !== 0){
  828. $out["answer"][] = [
  829. "title" => $title,
  830. "description" => $description,
  831. "url" => null,
  832. "thumb" => null,
  833. "table" => [],
  834. "sublink" => []
  835. ];
  836. }
  837. }
  838. }
  839. // reset
  840. $this->fuckhtml->load($html);
  841. }
  842. //
  843. // get "Related Searches" and "People also search for"
  844. //
  845. $relateds =
  846. $this->fuckhtml
  847. ->getElementsByClassName(
  848. "wyccme",
  849. "div"
  850. );
  851. foreach($relateds as $related){
  852. $text =
  853. $this->fuckhtml
  854. ->getTextContent(
  855. $related
  856. );
  857. if($text == "More results"){ continue; }
  858. $out["related"][] = $text;
  859. }
  860. //
  861. // Get text results
  862. //
  863. $results =
  864. $this->fuckhtml
  865. ->getElementsByClassName(
  866. "g",
  867. "div"
  868. );
  869. $this->skip_next = false;
  870. foreach($results as $result){
  871. if($this->skip_next){
  872. $this->skip_next = false;
  873. continue;
  874. }
  875. $this->fuckhtml->load($result);
  876. $web = [
  877. "title" => null,
  878. "description" => null,
  879. "url" => null,
  880. "date" => null,
  881. "type" => "web",
  882. "thumb" => [
  883. "url" => null,
  884. "ratio" => null
  885. ],
  886. "sublink" => [],
  887. "table" => []
  888. ];
  889. // Detect presence of sublinks
  890. $g =
  891. $this->fuckhtml
  892. ->getElementsByClassName(
  893. "g",
  894. "div"
  895. );
  896. $sublinks = [];
  897. if(count($g) > 0){
  898. $table =
  899. $this->fuckhtml
  900. ->getElementsByTagName(
  901. "table"
  902. );
  903. if(count($table) !== 0){
  904. // found some sublinks!
  905. $this->fuckhtml->load($table[0]);
  906. $tds =
  907. $this->fuckhtml
  908. ->getElementsByTagName(
  909. "td"
  910. );
  911. foreach($tds as $td){
  912. $this->fuckhtml->load($td);
  913. $a =
  914. $this->fuckhtml
  915. ->getElementsByTagName(
  916. "a"
  917. );
  918. if(
  919. count($a) === 0 ||
  920. (
  921. isset($a[0]["attributes"]["class"]) &&
  922. $a[0]["attributes"]["class"] == "fl"
  923. )
  924. ){
  925. continue;
  926. }
  927. $td["innerHTML"] =
  928. str_replace(
  929. $a[0]["outerHTML"],
  930. "",
  931. $td["innerHTML"]
  932. );
  933. $web["sublink"][] = [
  934. "title" =>
  935. $this->titledots(
  936. $this->fuckhtml
  937. ->getTextContent(
  938. $a[0]
  939. )
  940. ),
  941. "description" =>
  942. html_entity_decode(
  943. $this->titledots(
  944. $this->fuckhtml
  945. ->getTextContent(
  946. $td
  947. )
  948. )
  949. ),
  950. "url" =>
  951. $this->unshiturl(
  952. $a[0]
  953. ["attributes"]
  954. ["href"]
  955. ),
  956. "date" => null
  957. ];
  958. }
  959. // reset
  960. $this->fuckhtml->load($result);
  961. }
  962. // skip on next iteration
  963. $this->skip_next = true;
  964. }
  965. // get title
  966. $h3 =
  967. $this->fuckhtml
  968. ->getElementsByTagName(
  969. "h3"
  970. );
  971. if(count($h3) === 0){
  972. continue;
  973. }
  974. $web["title"] =
  975. $this->titledots(
  976. $this->fuckhtml
  977. ->getTextContent(
  978. $h3[0]
  979. )
  980. );
  981. // get url
  982. $as =
  983. $this->fuckhtml
  984. ->getElementsByTagName(
  985. "a"
  986. );
  987. $web["url"] =
  988. $this->unshiturl(
  989. $as[0]
  990. ["attributes"]
  991. ["href"]
  992. );
  993. if(
  994. !preg_match(
  995. '/^http/',
  996. $web["url"]
  997. )
  998. ){
  999. // skip if invalid url is found
  1000. continue;
  1001. }
  1002. //
  1003. // probe for twitter carousel
  1004. //
  1005. $carousel =
  1006. $this->fuckhtml
  1007. ->getElementsByTagName(
  1008. "g-scrolling-carousel"
  1009. );
  1010. if(count($carousel) !== 0){
  1011. $this->fuckhtml->load($carousel[0]);
  1012. $items =
  1013. $this->fuckhtml
  1014. ->getElementsByTagName(
  1015. "g-inner-card"
  1016. );
  1017. $has_thumbnail = false;
  1018. foreach($items as $item){
  1019. $this->fuckhtml->load($item);
  1020. if($has_thumbnail === false){
  1021. // get thumbnail
  1022. $thumb =
  1023. $this->fuckhtml
  1024. ->getElementsByTagName(
  1025. "img"
  1026. );
  1027. if(
  1028. count($thumb) !== 0 &&
  1029. isset($thumb[0]["attributes"]["id"])
  1030. ){
  1031. $web["thumb"] = [
  1032. "url" =>
  1033. $this->getdimg(
  1034. $thumb[0]["attributes"]["id"]
  1035. ),
  1036. "ratio" => "16:9"
  1037. ];
  1038. $has_thumbnail = true;
  1039. }
  1040. // or else, try getting a thumbnail from next container
  1041. }
  1042. // cache div
  1043. $div =
  1044. $this->fuckhtml
  1045. ->getElementsByTagName(
  1046. "div"
  1047. );
  1048. // get link
  1049. $links =
  1050. $this->fuckhtml
  1051. ->getElementsByTagName(
  1052. "a"
  1053. );
  1054. // get description of carousel sublink
  1055. $description =
  1056. $this->fuckhtml
  1057. ->getElementsByAttributeValue(
  1058. "role",
  1059. "heading",
  1060. $div
  1061. );
  1062. if(count($description) !== 0){
  1063. $description =
  1064. $this->titledots(
  1065. $this->fuckhtml
  1066. ->getTextContent(
  1067. $description[0]
  1068. )
  1069. );
  1070. }else{
  1071. $description = null;
  1072. }
  1073. $bottom =
  1074. $this->fuckhtml
  1075. ->getElementsByAttributeValue(
  1076. "style",
  1077. "z-index:2",
  1078. $div
  1079. );
  1080. $title = null;
  1081. $date = null;
  1082. if(count($bottom) !== 0){
  1083. $this->fuckhtml->load($bottom[0]);
  1084. $spans =
  1085. $this->fuckhtml
  1086. ->getElementsByTagName(
  1087. "span"
  1088. );
  1089. $title =
  1090. $this->fuckhtml
  1091. ->getTextContent(
  1092. $spans[0]
  1093. );
  1094. $date =
  1095. strtotime(
  1096. $this->fuckhtml
  1097. ->getTextContent(
  1098. $spans[count($spans) - 1]
  1099. )
  1100. );
  1101. }
  1102. $web["sublink"][] = [
  1103. "title" => $title,
  1104. "description" => $description,
  1105. "url" =>
  1106. $this->unshiturl(
  1107. $links[0]
  1108. ["attributes"]
  1109. ["href"]
  1110. ),
  1111. "date" => $date
  1112. ];
  1113. }
  1114. $out["web"][] = $web;
  1115. continue;
  1116. }
  1117. //
  1118. // get viewcount, time posted and follower count from <cite> tag
  1119. //
  1120. $cite =
  1121. $this->fuckhtml
  1122. ->getElementsByTagName(
  1123. "cite"
  1124. );
  1125. if(count($cite) !== 0){
  1126. $this->fuckhtml->load($cite[0]);
  1127. $spans =
  1128. $this->fuckhtml
  1129. ->getElementsByTagName("span");
  1130. if(count($spans) === 0){
  1131. $cites =
  1132. explode(
  1133. "·",
  1134. $this->fuckhtml
  1135. ->getTextContent(
  1136. $cite[0]
  1137. )
  1138. );
  1139. foreach($cites as $cite){
  1140. $cite = trim($cite);
  1141. if(
  1142. preg_match(
  1143. '/(.+) (views|followers|likes)$/',
  1144. $cite,
  1145. $match
  1146. )
  1147. ){
  1148. $web["table"][ucfirst($match[2])] =
  1149. $match[1];
  1150. }elseif(
  1151. preg_match(
  1152. '/ago$/',
  1153. $cite
  1154. )
  1155. ){
  1156. $web["date"] =
  1157. strtotime($cite);
  1158. }
  1159. }
  1160. }
  1161. // reset
  1162. $this->fuckhtml->load($result);
  1163. }
  1164. //
  1165. // attempt to fetch description cleanly
  1166. //
  1167. $description =
  1168. $this->fuckhtml
  1169. ->getElementsByAttributeValue(
  1170. "style",
  1171. "-webkit-line-clamp:2"
  1172. );
  1173. if(count($description) !== 0){
  1174. $web["description"] =
  1175. $this->titledots(
  1176. $this->fuckhtml
  1177. ->getTextContent(
  1178. $description[0]
  1179. )
  1180. );
  1181. }else{
  1182. // use ANOTHER method where the description is a header of the result
  1183. $description =
  1184. $this->fuckhtml
  1185. ->getElementsByAttributeValue(
  1186. "data-attrid",
  1187. "wa:/description"
  1188. );
  1189. if(count($description) !== 0){
  1190. // get date off that shit
  1191. $date =
  1192. $this->fuckhtml
  1193. ->getElementsByClassName(
  1194. $this->getstyle(
  1195. [
  1196. "font-size" => "12px",
  1197. "line-height" => "1.34",
  1198. "display" => "inline-block",
  1199. "font-family" => "google sans,arial,sans-serif",
  1200. "padding-right" => "0",
  1201. "white-space" => "nowrap"
  1202. ]
  1203. ),
  1204. "span"
  1205. );
  1206. if(count($date) !== 0){
  1207. $description[0]["innerHTML"] =
  1208. str_replace(
  1209. $date[0]["outerHTML"],
  1210. "",
  1211. $description[0]["innerHTML"]
  1212. );
  1213. $web["date"] =
  1214. strtotime(
  1215. $this->fuckhtml
  1216. ->getTextContent(
  1217. $date[0]
  1218. )
  1219. );
  1220. }
  1221. $web["description"] =
  1222. $this->fuckhtml
  1223. ->getTextContent(
  1224. $description[0]
  1225. );
  1226. }else{
  1227. // Yes.. You guessed it, use ANOTHER method to get descriptions
  1228. // off youtube containers
  1229. $description =
  1230. $this->fuckhtml
  1231. ->getElementsByClassName(
  1232. $this->getstyle(
  1233. [
  1234. "-webkit-box-orient" => "vertical",
  1235. "display" => "-webkit-box",
  1236. "font-size" => "14px",
  1237. "-webkit-line-clamp" => "2",
  1238. "line-height" => "22px",
  1239. "overflow" => "hidden",
  1240. "word-break" => "break-word",
  1241. "color" => "#4d5156"
  1242. ]
  1243. ),
  1244. "div"
  1245. );
  1246. if(count($description) !== 0){
  1247. // check for video duration
  1248. $duration =
  1249. $this->fuckhtml
  1250. ->getElementsByClassName(
  1251. $this->getstyle(
  1252. [
  1253. "background-color" => "rgba(0,0,0,0.6)",
  1254. "color" => "#fff",
  1255. "fill" => "#fff"
  1256. ]
  1257. ),
  1258. "div"
  1259. );
  1260. if(count($duration) !== 0){
  1261. $web["table"]["Duration"] =
  1262. $this->fuckhtml
  1263. ->getTextContent(
  1264. $duration[0]
  1265. );
  1266. }
  1267. $web["description"] =
  1268. $this->titledots(
  1269. html_entity_decode(
  1270. $this->fuckhtml
  1271. ->getTextContent(
  1272. $description[0]
  1273. )
  1274. )
  1275. );
  1276. // get author + time posted
  1277. $info =
  1278. $this->fuckhtml
  1279. ->getElementsByClassName(
  1280. $this->getstyle(
  1281. [
  1282. "color" => "var(" . $this->getcolorvar("#70757a") . ")",
  1283. "font-size" => "14px",
  1284. "line-height" => "20px",
  1285. "margin-top" => "12px"
  1286. ]
  1287. ),
  1288. "div"
  1289. );
  1290. if(count($info) !== 0){
  1291. $info =
  1292. explode(
  1293. "·",
  1294. $this->fuckhtml
  1295. ->getTextContent(
  1296. $info[0]
  1297. )
  1298. );
  1299. switch(count($info)){
  1300. case 3:
  1301. $web["table"]["Author"] = trim($info[1]);
  1302. $web["date"] = strtotime(trim($info[2]));
  1303. break;
  1304. case 2:
  1305. $web["date"] = strtotime(trim($info[1]));
  1306. break;
  1307. }
  1308. }
  1309. }
  1310. }
  1311. }
  1312. //
  1313. // get categories of content within the search result
  1314. //
  1315. $cats =
  1316. $this->fuckhtml
  1317. ->getElementsByAttributeName(
  1318. "data-sncf",
  1319. "div"
  1320. );
  1321. foreach($cats as $cat){
  1322. $this->fuckhtml->load($cat);
  1323. // detect image category
  1324. $images =
  1325. $this->fuckhtml
  1326. ->getElementsByTagName(
  1327. "img"
  1328. );
  1329. if(count($images) !== 0){
  1330. foreach($images as $image){
  1331. if(isset($image["attributes"]["id"])){
  1332. // we found an image
  1333. if(isset($image["attributes"]["width"])){
  1334. $width = (int)$image["attributes"]["width"];
  1335. if($width == 110){
  1336. $ratio = "1:1";
  1337. }elseif($width > 110){
  1338. $ratio = "16:9";
  1339. }else{
  1340. $ratio = "9:16";
  1341. }
  1342. }else{
  1343. $ratio = "1:1";
  1344. }
  1345. $web["thumb"] = [
  1346. "url" => $this->getdimg($image["attributes"]["id"]),
  1347. "ratio" => $ratio
  1348. ];
  1349. continue 2;
  1350. }
  1351. }
  1352. }
  1353. // Detect rating
  1354. $spans_unfiltered =
  1355. $this->fuckhtml
  1356. ->getElementsByTagName(
  1357. "span"
  1358. );
  1359. $spans =
  1360. $this->fuckhtml
  1361. ->getElementsByAttributeName(
  1362. "aria-label",
  1363. $spans_unfiltered
  1364. );
  1365. foreach($spans as $span){
  1366. if(
  1367. preg_match(
  1368. '/^Rated/',
  1369. $span["attributes"]["aria-label"]
  1370. )
  1371. ){
  1372. // found rating
  1373. // scrape rating
  1374. preg_match(
  1375. '/([0-9.]+).*([0-9.]+)/',
  1376. $span["attributes"]["aria-label"],
  1377. $rating
  1378. );
  1379. if(isset($rating[1])){
  1380. $web["table"]["Rating"] =
  1381. $rating[1] . "/" . $rating[2];
  1382. }
  1383. $has_seen_reviews = 0;
  1384. foreach($spans_unfiltered as $span_unfiltered){
  1385. if(
  1386. preg_match(
  1387. '/([0-9,.]+) +([A-z]+)$/',
  1388. $this->fuckhtml
  1389. ->getTextContent(
  1390. $span_unfiltered
  1391. ),
  1392. $votes
  1393. )
  1394. ){
  1395. $has_seen_reviews++;
  1396. $web["table"][ucfirst($votes[2])] = $votes[1];
  1397. continue;
  1398. }
  1399. $text =
  1400. $this->fuckhtml
  1401. ->getTextContent(
  1402. $span_unfiltered
  1403. );
  1404. if(
  1405. $text == "&nbsp;&nbsp;&nbsp;" ||
  1406. $text == ""
  1407. ){
  1408. break;
  1409. }
  1410. switch($has_seen_reviews){
  1411. case 1:
  1412. // scrape price
  1413. $web["table"]["Price"] = $text;
  1414. $has_seen_reviews++;
  1415. break;
  1416. case 2:
  1417. // scrape platform
  1418. $web["table"]["Platform"] = $text;
  1419. $has_seen_reviews++;
  1420. break;
  1421. case 3:
  1422. // Scrape type
  1423. $web["table"]["Medium"] = $text;
  1424. break;
  1425. }
  1426. }
  1427. continue 2;
  1428. }
  1429. }
  1430. // check if its a table of small sublinks
  1431. $table =
  1432. $this->fuckhtml
  1433. ->getElementsByClassName(
  1434. $this->getstyle(
  1435. [
  1436. "display" => "table",
  1437. "white-space" => "nowrap",
  1438. "margin" => "5px 0",
  1439. "line-height" => "1.58",
  1440. "color" => "var(" . $this->getcolorvar("#70757a") . ")"
  1441. ]
  1442. ),
  1443. "div"
  1444. );
  1445. if(count($table) !== 0){
  1446. $this->fuckhtml->load($table[0]);
  1447. $rows =
  1448. $this->fuckhtml
  1449. ->getElementsByClassName(
  1450. $this->getstyle(
  1451. [
  1452. "display" => "flex",
  1453. "white-space" => "normal"
  1454. ]
  1455. ),
  1456. "div"
  1457. );
  1458. foreach($rows as $row){
  1459. $this->fuckhtml->load($row);
  1460. $sublink = [
  1461. "title" => null,
  1462. "description" => null,
  1463. "url" => null,
  1464. "date" => null
  1465. ];
  1466. $link =
  1467. $this->fuckhtml
  1468. ->getElementsByTagName(
  1469. "a"
  1470. )[0];
  1471. $sublink["title"] =
  1472. $this->titledots(
  1473. $this->fuckhtml
  1474. ->getTextContent(
  1475. $link
  1476. )
  1477. );
  1478. $sublink["url"] =
  1479. $this->unshiturl(
  1480. $link
  1481. ["attributes"]
  1482. ["href"]
  1483. );
  1484. $row["innerHTML"] =
  1485. str_replace(
  1486. $link["outerHTML"],
  1487. "",
  1488. $row["innerHTML"]
  1489. );
  1490. $this->fuckhtml->load($row);
  1491. $spans =
  1492. $this->fuckhtml
  1493. ->getElementsByTagName(
  1494. "span"
  1495. );
  1496. foreach($spans as $span){
  1497. $text =
  1498. $this->fuckhtml
  1499. ->getTextContent(
  1500. $span
  1501. );
  1502. if(
  1503. preg_match(
  1504. '/answers?$/',
  1505. $text
  1506. )
  1507. ){
  1508. $sublink["description"] =
  1509. $text;
  1510. continue;
  1511. }
  1512. $time = strtotime($text);
  1513. if($time !== false){
  1514. $sublink["date"] = $time;
  1515. }
  1516. }
  1517. $web["sublink"][] = $sublink;
  1518. }
  1519. // reset
  1520. $this->fuckhtml->load($cat);
  1521. continue;
  1522. }
  1523. // check if its an answer header
  1524. $answer_header =
  1525. $this->fuckhtml
  1526. ->getElementsByClassName(
  1527. $this->getstyle(
  1528. [
  1529. "overflow" => "hidden",
  1530. "text-overflow" => "ellipsis"
  1531. ]
  1532. ),
  1533. "span"
  1534. );
  1535. if(count($answer_header) !== 0){
  1536. $link =
  1537. $this->fuckhtml
  1538. ->getElementsByTagName(
  1539. "a"
  1540. );
  1541. $cat["innerHTML"] =
  1542. str_replace(
  1543. $link[0]["outerHTML"],
  1544. "",
  1545. $cat["innerHTML"]
  1546. );
  1547. $web["sublink"][] = [
  1548. "title" =>
  1549. $this->fuckhtml
  1550. ->getTextContent(
  1551. $link[0]
  1552. ),
  1553. "description" =>
  1554. $this->titledots(
  1555. trim(
  1556. str_replace(
  1557. "\xc2\xa0",
  1558. " ",
  1559. html_entity_decode(
  1560. $this->fuckhtml
  1561. ->getTextContent(
  1562. $cat
  1563. )
  1564. )
  1565. ),
  1566. " ·"
  1567. )
  1568. ),
  1569. "url" =>
  1570. $this->fuckhtml
  1571. ->getTextContent(
  1572. $link[0]
  1573. ["attributes"]
  1574. ["href"]
  1575. ),
  1576. "date" => null
  1577. ];
  1578. continue;
  1579. }
  1580. // check if its list of small sublinks
  1581. $urls =
  1582. $this->fuckhtml
  1583. ->getElementsByTagName(
  1584. "a"
  1585. );
  1586. if(count($urls) !== 0){
  1587. // found small links
  1588. foreach($urls as $url){
  1589. $target =
  1590. $this->fuckhtml
  1591. ->getTextContent(
  1592. $url
  1593. ["attributes"]
  1594. ["href"]
  1595. );
  1596. if(
  1597. !preg_match(
  1598. '/^http/',
  1599. $target
  1600. )
  1601. ){
  1602. continue;
  1603. }
  1604. $web["sublink"][] = [
  1605. "title" =>
  1606. $this->titledots(
  1607. $this->fuckhtml
  1608. ->getTextContent(
  1609. $url
  1610. )
  1611. ),
  1612. "description" => null,
  1613. "url" => $target,
  1614. "date" => null
  1615. ];
  1616. }
  1617. continue;
  1618. }
  1619. // we probed everything, assume this is the description
  1620. // if we didn't find one cleanly previously
  1621. if($web["description"] === null){
  1622. $web["description"] =
  1623. $this->titledots(
  1624. $this->fuckhtml
  1625. ->getTextContent(
  1626. $cat
  1627. )
  1628. );
  1629. }
  1630. }
  1631. // check if description contains date
  1632. $description = explode("—", $web["description"], 2);
  1633. if(
  1634. count($description) === 2 &&
  1635. strlen($description[0]) <= 20
  1636. ){
  1637. $date = strtotime($description[0]);
  1638. if($date !== false){
  1639. $web["date"] = $date;
  1640. $web["description"] = ltrim($description[1]);
  1641. }
  1642. }
  1643. // fetch youtube thumbnail
  1644. $thumbnail =
  1645. $this->fuckhtml
  1646. ->getElementsByClassName(
  1647. $this->getstyle(
  1648. [
  1649. "border-radius" => "8px",
  1650. "height" => "fit-content",
  1651. "justify-content" => "center",
  1652. "margin-right" => "20px",
  1653. "margin-top" => "4px",
  1654. "position" => "relative",
  1655. "width" => "fit-content"
  1656. ]
  1657. ),
  1658. "div"
  1659. );
  1660. if(count($thumbnail) !== 0){
  1661. // load thumbnail container
  1662. $this->fuckhtml->load($thumbnail[0]);
  1663. $image =
  1664. $this->fuckhtml
  1665. ->getElementsByTagName(
  1666. "img"
  1667. );
  1668. if(
  1669. count($image) !== 0 &&
  1670. isset($image[0]["attributes"]["id"])
  1671. ){
  1672. $web["thumb"] = [
  1673. "url" =>
  1674. $this->unshit_thumb(
  1675. $this->getdimg(
  1676. $image[0]["attributes"]["id"]
  1677. )
  1678. ),
  1679. "ratio" => "16:9"
  1680. ];
  1681. }
  1682. // reset
  1683. $this->fuckhtml->load($result);
  1684. }
  1685. $out["web"][] = $web;
  1686. }
  1687. // reset
  1688. $this->fuckhtml->load($result_div);
  1689. //
  1690. // Get instant answers
  1691. //
  1692. $answer_containers =
  1693. $this->fuckhtml
  1694. ->getElementsByClassName(
  1695. $this->getstyle(
  1696. [
  1697. "padding-left" => "0px",
  1698. "padding-right" => "0px"
  1699. ]
  1700. ),
  1701. "div"
  1702. );
  1703. $date_class =
  1704. $this->getstyle(
  1705. [
  1706. "font-size" => "12px",
  1707. "line-height" => "1.34",
  1708. "display" => "inline-block",
  1709. "font-family" => "google sans,arial,sans-serif",
  1710. "padding-right" => "0",
  1711. "white-space" => "nowrap"
  1712. ]
  1713. );
  1714. foreach($answer_containers as $container){
  1715. $this->fuckhtml->load($container);
  1716. $web = [
  1717. "title" => null,
  1718. "description" => null,
  1719. "url" => null,
  1720. "date" => null,
  1721. "type" => "web",
  1722. "thumb" => [
  1723. "url" => null,
  1724. "ratio" => null
  1725. ],
  1726. "sublink" => [],
  1727. "table" => []
  1728. ];
  1729. $answers =
  1730. $this->fuckhtml
  1731. ->getElementsByAttributeName(
  1732. "aria-controls",
  1733. "div"
  1734. );
  1735. $item_insert_pos = 1;
  1736. foreach($answers as $answer){
  1737. $out["related"][] =
  1738. $this->fuckhtml
  1739. ->getTextContent(
  1740. $answer
  1741. );
  1742. if(
  1743. isset(
  1744. $this->blobs[
  1745. $answer
  1746. ["attributes"]
  1747. ["aria-controls"]
  1748. ]
  1749. )
  1750. ){
  1751. $this->fuckhtml->load(
  1752. $this->blobs[
  1753. $answer
  1754. ["attributes"]
  1755. ["aria-controls"]
  1756. ]
  1757. );
  1758. $divs =
  1759. $this->fuckhtml
  1760. ->getElementsByAttributeName(
  1761. "id",
  1762. "div"
  1763. );
  1764. foreach($divs as $div){
  1765. if(
  1766. !isset(
  1767. $this->blobs[
  1768. $div
  1769. ["attributes"]
  1770. ["id"]
  1771. ]
  1772. )
  1773. ){
  1774. continue;
  1775. }
  1776. $this->fuckhtml->load(
  1777. $this->blobs[
  1778. $div
  1779. ["attributes"]
  1780. ["id"]
  1781. ]
  1782. );
  1783. // get url
  1784. $as =
  1785. $this->fuckhtml
  1786. ->getElementsByTagName(
  1787. "a"
  1788. );
  1789. if(count($as) !== 0){
  1790. $web["url"] =
  1791. $this->unshiturl(
  1792. $as[0]["attributes"]["href"]
  1793. );
  1794. // skip entries that redirect to a search
  1795. if(
  1796. !preg_match(
  1797. '/^http/',
  1798. $web["url"]
  1799. )
  1800. ){
  1801. continue 3;
  1802. }
  1803. }
  1804. // get title
  1805. $h3 =
  1806. $this->fuckhtml
  1807. ->getElementsByTagName(
  1808. "h3"
  1809. );
  1810. if(count($h3) !== 0){
  1811. $web["title"] =
  1812. $this->titledots(
  1813. $this->fuckhtml
  1814. ->getTextContent(
  1815. $h3[0]
  1816. )
  1817. );
  1818. }
  1819. $description =
  1820. $this->fuckhtml
  1821. ->getElementsByAttributeValue(
  1822. "data-attrid",
  1823. "wa:/description",
  1824. "div"
  1825. );
  1826. if(count($description) !== 0){
  1827. // check for date
  1828. $this->fuckhtml->load($description[0]);
  1829. $date =
  1830. $this->fuckhtml
  1831. ->getElementsByClassName(
  1832. $date_class,
  1833. "span"
  1834. );
  1835. if(count($date) !== 0){
  1836. $description[0]["innerHTML"] =
  1837. str_replace(
  1838. $date[0]["outerHTML"],
  1839. "",
  1840. $description[0]["innerHTML"]
  1841. );
  1842. $web["date"] =
  1843. strtotime(
  1844. $this->fuckhtml
  1845. ->getTextContent(
  1846. $date[0]
  1847. )
  1848. );
  1849. }
  1850. $web["description"] =
  1851. ltrim(
  1852. $this->fuckhtml
  1853. ->getTextContent(
  1854. $description[0]
  1855. ),
  1856. ": "
  1857. );
  1858. }
  1859. }
  1860. foreach($out["web"] as $item){
  1861. if($item["url"] == $web["url"]){
  1862. continue 2;
  1863. }
  1864. }
  1865. array_splice($out["web"], $item_insert_pos, 0, [$web]);
  1866. $item_insert_pos++;
  1867. }
  1868. }
  1869. }
  1870. // reset
  1871. $this->fuckhtml->load($result_div);
  1872. //
  1873. // Scrape word definition
  1874. //
  1875. $definition_container =
  1876. $this->fuckhtml
  1877. ->getElementsByClassName(
  1878. "lr_container",
  1879. "div"
  1880. );
  1881. if(count($definition_container) !== 0){
  1882. $this->fuckhtml->load($definition_container[0]);
  1883. // get header
  1884. $header =
  1885. $this->fuckhtml
  1886. ->getElementsByAttributeValue(
  1887. "data-attrid",
  1888. "EntryHeader",
  1889. "div"
  1890. );
  1891. if(count($header) !== 0){
  1892. $description = [];
  1893. $this->fuckhtml->load($header[0]);
  1894. $title_div =
  1895. $this->fuckhtml
  1896. ->getElementsByClassName(
  1897. $this->getstyle(
  1898. [
  1899. "font-family" => "google sans,arial,sans-serif",
  1900. "font-size" => "28px",
  1901. "line-height" => "36px"
  1902. ]
  1903. )
  1904. );
  1905. if(count($title_div) !== 0){
  1906. $title =
  1907. $this->fuckhtml
  1908. ->getTextContent(
  1909. $title_div[0]
  1910. );
  1911. }else{
  1912. $title = "Word definition";
  1913. }
  1914. $subtext_div =
  1915. $this->fuckhtml
  1916. ->getElementsByClassName(
  1917. $this->getstyle(
  1918. [
  1919. "font-family" => "arial,sans-serif",
  1920. "font-size" => "14px",
  1921. "line-height" => "22px"
  1922. ]
  1923. ),
  1924. "span"
  1925. );
  1926. if(count($subtext_div) !== 0){
  1927. $description[] = [
  1928. "type" => "quote",
  1929. "value" =>
  1930. $this->fuckhtml
  1931. ->getTextContent(
  1932. $subtext_div[0]
  1933. )
  1934. ];
  1935. }
  1936. // get audio
  1937. $audio =
  1938. $this->fuckhtml
  1939. ->getElementsByTagName(
  1940. "audio"
  1941. );
  1942. if(count($audio) !== 0){
  1943. $this->fuckhtml->load($audio[0]);
  1944. $source =
  1945. $this->fuckhtml
  1946. ->getElementsByTagName(
  1947. "source"
  1948. );
  1949. if(count($source) !== 0){
  1950. $description[] = [
  1951. "type" => "audio",
  1952. "url" =>
  1953. preg_replace(
  1954. '/^\/\//',
  1955. "https://",
  1956. $this->fuckhtml
  1957. ->getTextContent(
  1958. $source[0]
  1959. ["attributes"]
  1960. ["src"]
  1961. )
  1962. )
  1963. ];
  1964. }
  1965. }
  1966. // remove header to avoid confusion
  1967. $definition_container[0]["innerHTML"] =
  1968. str_replace(
  1969. $header[0]["outerHTML"],
  1970. "",
  1971. $definition_container[0]["innerHTML"]
  1972. );
  1973. // reset
  1974. $this->fuckhtml->load($definition_container[0]);
  1975. $vmods =
  1976. $this->fuckhtml
  1977. ->getElementsByClassName(
  1978. "vmod",
  1979. "div"
  1980. );
  1981. foreach($vmods as $category){
  1982. if(
  1983. !isset(
  1984. $category
  1985. ["attributes"]
  1986. ["data-topic"]
  1987. ) ||
  1988. $category
  1989. ["attributes"]
  1990. ["class"] != "vmod"
  1991. ){
  1992. continue;
  1993. }
  1994. $this->fuckhtml->load($category);
  1995. // get category type
  1996. $type =
  1997. $this->fuckhtml
  1998. ->getElementsByTagName(
  1999. "i"
  2000. );
  2001. if(count($type) !== 0){
  2002. $description[] = [
  2003. "type" => "title",
  2004. "value" =>
  2005. $this->fuckhtml
  2006. ->getTextContent(
  2007. $type[0]
  2008. )
  2009. ];
  2010. }
  2011. // get heading text
  2012. $headings =
  2013. $this->fuckhtml
  2014. ->getElementsByClassName(
  2015. "xpdxpnd",
  2016. "div"
  2017. );
  2018. foreach($headings as $heading){
  2019. $description[] = [
  2020. "type" => "quote",
  2021. "value" =>
  2022. $this->fuckhtml
  2023. ->getTextContent(
  2024. $heading
  2025. )
  2026. ];
  2027. }
  2028. $definitions =
  2029. $this->fuckhtml
  2030. ->getElementsByAttributeValue(
  2031. "data-attrid",
  2032. "SenseDefinition",
  2033. "div"
  2034. );
  2035. $i = 1;
  2036. $text = [];
  2037. foreach($definitions as $definition){
  2038. $text[] =
  2039. $i . ". " .
  2040. $this->fuckhtml
  2041. ->getTextContent(
  2042. $definition
  2043. );
  2044. $i++;
  2045. }
  2046. if(count($text) !== 0){
  2047. $description[] = [
  2048. "type" => "text",
  2049. "value" =>
  2050. implode("\n", $text)
  2051. ];
  2052. }
  2053. }
  2054. $out["answer"][] = [
  2055. "title" => $title,
  2056. "description" => $description,
  2057. "url" => null,
  2058. "thumb" => null,
  2059. "table" => [],
  2060. "sublink" => []
  2061. ];
  2062. }
  2063. // reset
  2064. $this->fuckhtml->load($result_div);
  2065. }
  2066. //
  2067. // scrape elements with a g-section-with-header
  2068. // includes: images, news carousels
  2069. //
  2070. $g_sections =
  2071. $this->fuckhtml
  2072. ->getElementsByTagName(
  2073. "g-section-with-header"
  2074. );
  2075. if(count($g_sections) !== 0){
  2076. foreach($g_sections as $g_section){
  2077. // parse elements with a g-section-with-header
  2078. $this->fuckhtml->load($g_section);
  2079. $div_title =
  2080. $this->fuckhtml
  2081. ->getElementsByClassName(
  2082. "a-no-hover-decoration",
  2083. "a"
  2084. );
  2085. if(count($div_title) !== 0){
  2086. // title detected, skip
  2087. continue;
  2088. }
  2089. // no title detected: detect news container
  2090. $news =
  2091. $this->fuckhtml
  2092. ->getElementsByClassName(
  2093. $this->getstyle(
  2094. [
  2095. "outline-offset" => "-1px",
  2096. "outline-width" => "1px",
  2097. "display" => "flex",
  2098. "flex-direction" => "column",
  2099. "flex-grow" => "1"
  2100. ]
  2101. )
  2102. );
  2103. foreach($news as $new){
  2104. $this->fuckhtml->load($new);
  2105. $image =
  2106. $this->fuckhtml
  2107. ->getElementsByAttributeName(
  2108. "id",
  2109. "img"
  2110. );
  2111. if(
  2112. count($image) !== 0 &&
  2113. !(
  2114. isset($image[0]["attributes"]["style"]) &&
  2115. strpos(
  2116. $image[0]["attributes"]["style"],
  2117. "height:18px"
  2118. ) !== false
  2119. )
  2120. ){
  2121. $thumb = [
  2122. "url" =>
  2123. $this->getdimg(
  2124. $image[0]
  2125. ["attributes"]
  2126. ["id"]
  2127. ),
  2128. "ratio" => "1:1"
  2129. ];
  2130. }
  2131. $title =
  2132. $this->titledots(
  2133. $this->fuckhtml
  2134. ->getTextContent(
  2135. $this->fuckhtml
  2136. ->getElementsByAttributeValue(
  2137. "role",
  2138. "heading",
  2139. "div"
  2140. )[0]
  2141. )
  2142. );
  2143. $date_div =
  2144. $this->fuckhtml
  2145. ->getElementsByAttributeName(
  2146. "style",
  2147. "div"
  2148. );
  2149. if(count($date_div) !== 0){
  2150. foreach($date_div as $div){
  2151. if(
  2152. strpos(
  2153. $div["attributes"]["style"],
  2154. "bottom:"
  2155. ) !== false
  2156. ){
  2157. $date =
  2158. strtotime(
  2159. $this->fuckhtml
  2160. ->getTextContent(
  2161. $div
  2162. )
  2163. );
  2164. break;
  2165. }
  2166. }
  2167. }else{
  2168. $date = null;
  2169. }
  2170. $out["news"][] = [
  2171. "title" => $title,
  2172. "description" => null,
  2173. "date" => $date,
  2174. "thumb" => $thumb,
  2175. "url" =>
  2176. $this->fuckhtml
  2177. ->getTextContent(
  2178. $new
  2179. ["attributes"]
  2180. ["href"]
  2181. )
  2182. ];
  2183. }
  2184. }
  2185. // reset
  2186. $this->fuckhtml->load($result_div);
  2187. }
  2188. //
  2189. // Parse images (carousel, left hand-side)
  2190. //
  2191. $image_carousels =
  2192. $this->fuckhtml
  2193. ->getElementsByAttributeValue(
  2194. "id",
  2195. "media_result_group",
  2196. "div"
  2197. );
  2198. if(count($image_carousels) !== 0){
  2199. foreach($image_carousels as $image_carousel){
  2200. $this->fuckhtml->load($image_carousel);
  2201. // get related searches in image carousel
  2202. $relateds =
  2203. $this->fuckhtml
  2204. ->getElementsByClassName(
  2205. $this->getstyle(
  2206. [
  2207. "display" => "inline-block",
  2208. "margin-right" => "6px",
  2209. "outline" => "none",
  2210. "padding" => "6px 0"
  2211. ],
  2212. "a"
  2213. )
  2214. );
  2215. foreach($relateds as $related){
  2216. if(!isset($related["innerHTML"])){
  2217. // found an image
  2218. continue;
  2219. }
  2220. $text =
  2221. $this->fuckhtml
  2222. ->getTextContent(
  2223. $related
  2224. );
  2225. if($text != ""){
  2226. $out["related"][] = $text;
  2227. }
  2228. }
  2229. $div =
  2230. $this->fuckhtml
  2231. ->getElementsByTagName(
  2232. "div"
  2233. );
  2234. // get loaded images
  2235. $images =
  2236. $this->fuckhtml
  2237. ->getElementsByClassName(
  2238. "ivg-i",
  2239. $div
  2240. );
  2241. foreach($images as $image){
  2242. $this->fuckhtml->load($image);
  2243. $img_tags =
  2244. $this->fuckhtml
  2245. ->getElementsByTagName(
  2246. "img"
  2247. );
  2248. if(
  2249. !isset($image["attributes"]["data-docid"]) ||
  2250. !isset($this->image_arr[$image["attributes"]["data-docid"]])
  2251. ){
  2252. continue;
  2253. }
  2254. // search for the right image tag
  2255. $image_tag = false;
  2256. foreach($img_tags as $img){
  2257. if(
  2258. isset(
  2259. $img
  2260. ["attributes"]
  2261. ["alt"]
  2262. ) &&
  2263. trim(
  2264. $img
  2265. ["attributes"]
  2266. ["alt"]
  2267. ) != ""
  2268. ){
  2269. $image_tag = $img;
  2270. break;
  2271. }
  2272. }
  2273. if($image_tag === false){
  2274. continue;
  2275. }
  2276. $out["image"][] = [
  2277. "title" =>
  2278. $this->titledots(
  2279. $this->fuckhtml
  2280. ->getTextContent(
  2281. $image_tag
  2282. ["attributes"]
  2283. ["alt"]
  2284. )
  2285. ),
  2286. "source" =>
  2287. $this->image_arr[
  2288. $image
  2289. ["attributes"]
  2290. ["data-docid"]
  2291. ],
  2292. "url" =>
  2293. $this->fuckhtml
  2294. ->getTextContent(
  2295. $image
  2296. ["attributes"]
  2297. ["data-lpage"]
  2298. )
  2299. ];
  2300. }
  2301. // get unloaded javascript images
  2302. $images_js_sel =
  2303. $this->fuckhtml
  2304. ->getElementsByAttributeName(
  2305. "id",
  2306. $div
  2307. );
  2308. $loaded = [];
  2309. foreach($images_js_sel as $sel){
  2310. if(
  2311. !isset($this->blobs[$sel["attributes"]["id"]]) ||
  2312. in_array((string)$sel["attributes"]["id"], $loaded, true)
  2313. ){
  2314. // not an unloaded javascript image
  2315. continue;
  2316. }
  2317. $loaded[] = $sel["attributes"]["id"];
  2318. // get yet another javascript component
  2319. $this->fuckhtml->load($this->blobs[$sel["attributes"]["id"]]);
  2320. // get js node: contains title & url
  2321. $js_node =
  2322. $this->fuckhtml
  2323. ->getElementsByTagName(
  2324. "div"
  2325. )[0];
  2326. if(!isset($this->blobs[$js_node["attributes"]["id"]])){
  2327. // did not find refer id
  2328. continue;
  2329. }
  2330. // load second javascript component
  2331. $this->fuckhtml->load($this->blobs[$js_node["attributes"]["id"]]);
  2332. // get title from image alt text.
  2333. // data-src from this image is cropped, ignore it..
  2334. $img =
  2335. $this->fuckhtml
  2336. ->getElementsByTagName(
  2337. "img"
  2338. )[0];
  2339. $out["image"][] = [
  2340. "title" =>
  2341. $this->fuckhtml
  2342. ->getTextContent(
  2343. $img["attributes"]["alt"]
  2344. ),
  2345. "source" =>
  2346. $this->image_arr[
  2347. $js_node["attributes"]["data-docid"]
  2348. ],
  2349. "url" =>
  2350. $this->fuckhtml
  2351. ->getTextContent(
  2352. $js_node["attributes"]["data-lpage"]
  2353. )
  2354. ];
  2355. }
  2356. }
  2357. // reset
  2358. $this->fuckhtml->load($result_div);
  2359. }
  2360. //
  2361. // Parse videos
  2362. //
  2363. $this->fuckhtml->load($result_div);
  2364. $videos =
  2365. $this->fuckhtml
  2366. ->getElementsByAttributeName(
  2367. "data-vid",
  2368. "div"
  2369. );
  2370. foreach($videos as $video){
  2371. $this->fuckhtml->load($video);
  2372. // get url
  2373. $url =
  2374. $this->fuckhtml
  2375. ->getTextContent(
  2376. $video
  2377. ["attributes"]
  2378. ["data-surl"]
  2379. );
  2380. foreach($out["web"] as $link){
  2381. if($link["url"] == $url){
  2382. // ignore if we already have the video in $out["web"]
  2383. continue 2;
  2384. }
  2385. }
  2386. // get heading element
  2387. $heading =
  2388. $this->fuckhtml
  2389. ->getElementsByAttributeValue(
  2390. "role",
  2391. "heading",
  2392. "div"
  2393. );
  2394. if(count($heading) === 0){
  2395. // no heading, fuck this.
  2396. continue;
  2397. }
  2398. // get thumbnail before loading heading object
  2399. $image =
  2400. $this->fuckhtml
  2401. ->getElementsByAttributeName(
  2402. "id",
  2403. "img"
  2404. );
  2405. if(count($image) !== 0){
  2406. $thumb = [
  2407. "url" => $this->getdimg($image[0]["attributes"]["id"]),
  2408. "ratio" => "16:9"
  2409. ];
  2410. }else{
  2411. $thumb = [
  2412. "url" => null,
  2413. "ratio" => null
  2414. ];
  2415. }
  2416. // get duration
  2417. $duration_div =
  2418. $this->fuckhtml
  2419. ->getElementsByClassName(
  2420. $this->getstyle(
  2421. [
  2422. "border-radius" => "10px",
  2423. "font-family" => "arial,sans-serif-medium,sans-serif",
  2424. "font-size" => "12px",
  2425. "line-height" => "16px",
  2426. "padding-block" => "2px",
  2427. "padding-inline" => "8px"
  2428. ]
  2429. ),
  2430. "div"
  2431. );
  2432. if(count($duration_div) !== 0){
  2433. $duration =
  2434. $this->hms2int(
  2435. $this->fuckhtml
  2436. ->getTextContent(
  2437. $duration_div[0]
  2438. )
  2439. );
  2440. }else{
  2441. // check if its a livestream
  2442. $duration =
  2443. $this->fuckhtml
  2444. ->getElementsByClassName(
  2445. $this->getstyle(
  2446. [
  2447. "background-color" => "#d93025",
  2448. "border-radius" => "10px",
  2449. "color" => "#fff",
  2450. "font-family" => "arial,sans-serif-medium,sans-serif",
  2451. "font-size" => "12px",
  2452. "line-height" => "16px",
  2453. "padding-block" => "2px",
  2454. "padding-inline" => "8px"
  2455. ]
  2456. ),
  2457. "span"
  2458. );
  2459. if(count($duration) !== 0){
  2460. $duration = "_LIVE";
  2461. }else{
  2462. $duration = null;
  2463. }
  2464. }
  2465. // load heading
  2466. $this->fuckhtml->load($heading[0]);
  2467. // get title
  2468. $title =
  2469. $this->fuckhtml
  2470. ->getElementsByClassName(
  2471. $this->getstyle(
  2472. [
  2473. "font-family" => "arial,sans-serif",
  2474. "font-size" => "16px",
  2475. "font-weight" => "400",
  2476. "line-height" => "24px"
  2477. ]
  2478. ),
  2479. "div"
  2480. );
  2481. if(count($title) === 0){
  2482. // ?? no title
  2483. continue;
  2484. }
  2485. $title =
  2486. $this->titledots(
  2487. $this->fuckhtml
  2488. ->getTextContent(
  2489. $title[0]
  2490. )
  2491. );
  2492. // get date
  2493. $date_div =
  2494. $this->fuckhtml
  2495. ->getElementsByClassName(
  2496. $this->getstyle(
  2497. [
  2498. "color" => "var(" . $this->getcolorvar("#70757a") . ")",
  2499. "font-size" => "14px"
  2500. ]
  2501. ),
  2502. "div"
  2503. );
  2504. if(count($date_div) !== 0){
  2505. $date = strtotime(
  2506. $this->fuckhtml
  2507. ->getTextContent(
  2508. $date_div[0]
  2509. )
  2510. );
  2511. if($date === false){
  2512. // failed to parse date
  2513. $date = null;
  2514. }
  2515. }else{
  2516. $date = null;
  2517. }
  2518. $out["video"][] = [
  2519. "title" => $title,
  2520. "description" => null,
  2521. "date" => $date,
  2522. "duration" => $duration,
  2523. "views" => null,
  2524. "thumb" => $thumb,
  2525. "url" => $url
  2526. ];
  2527. }
  2528. //
  2529. // Parse featured results (which contain images, fuck the rest desu)
  2530. //
  2531. $this->fuckhtml->load($html);
  2532. $top =
  2533. $this->fuckhtml
  2534. ->getElementsByAttributeValue(
  2535. "aria-label",
  2536. "Featured results",
  2537. "div"
  2538. );
  2539. if(count($top) !== 0){
  2540. $this->fuckhtml->load($top[0]);
  2541. // get images
  2542. $grid =
  2543. $this->fuckhtml
  2544. ->getElementsByClassName(
  2545. $this->getstyle(
  2546. [
  2547. "border-radius" => "20px",
  2548. "display" => "grid",
  2549. "grid-gap" => "2px",
  2550. "grid-template-rows" => "repeat(2,minmax(0,1fr))",
  2551. "overflow" => "hidden",
  2552. "bottom" => "0",
  2553. "left" => "0",
  2554. "right" => "0",
  2555. "top" => "0",
  2556. "position" => "absolute",
  2557. ]
  2558. ),
  2559. "div"
  2560. );
  2561. if(count($grid) !== 0){
  2562. // we found image grid
  2563. $this->fuckhtml->load($grid[0]);
  2564. $images_div =
  2565. $this->fuckhtml
  2566. ->getElementsByAttributeName(
  2567. "data-attrid",
  2568. "div"
  2569. );
  2570. foreach($images_div as $image_div){
  2571. $this->fuckhtml->load($image_div);
  2572. $image =
  2573. $this->fuckhtml
  2574. ->getElementsByTagName(
  2575. "img"
  2576. );
  2577. if(
  2578. count($image) === 0 ||
  2579. !isset($image_div["attributes"]["data-docid"]) ||
  2580. !isset($this->image_arr[$image_div["attributes"]["data-docid"]])
  2581. ){
  2582. // ?? no image, continue
  2583. continue;
  2584. }
  2585. $out["image"][] = [
  2586. "title" =>
  2587. $this->titledots(
  2588. $this->fuckhtml
  2589. ->getTextContent(
  2590. $image[0]["attributes"]["alt"]
  2591. )
  2592. ),
  2593. "source" =>
  2594. $this->image_arr[
  2595. $image_div["attributes"]["data-docid"]
  2596. ],
  2597. "url" =>
  2598. $this->fuckhtml
  2599. ->getTextContent(
  2600. $image_div["attributes"]["data-lpage"]
  2601. )
  2602. ];
  2603. }
  2604. }
  2605. }
  2606. //
  2607. // craft $npt token
  2608. //
  2609. if(
  2610. $last_page === false &&
  2611. count($out["web"]) !== 0
  2612. ){
  2613. if(!isset($params["start"])){
  2614. $params["start"] = 20;
  2615. }else{
  2616. $params["start"] += 20;
  2617. }
  2618. $out["npt"] =
  2619. $this->backend
  2620. ->store(
  2621. json_encode($params),
  2622. $pagetype,
  2623. $proxy
  2624. );
  2625. }
  2626. //
  2627. // Parse right handside
  2628. //
  2629. $this->fuckhtml->load($html);
  2630. $rhs =
  2631. $this->fuckhtml
  2632. ->getElementById(
  2633. "rhs"
  2634. );
  2635. if($rhs === null){
  2636. return $out;
  2637. }
  2638. $this->fuckhtml->load($rhs);
  2639. // get images gallery
  2640. $image_gallery =
  2641. $this->fuckhtml
  2642. ->getElementsByAttributeValue(
  2643. "data-rc",
  2644. "ivg-i",
  2645. "div"
  2646. );
  2647. if(count($image_gallery) !== 0){
  2648. $this->fuckhtml->load($image_gallery[0]);
  2649. // get images
  2650. $images_div =
  2651. $this->fuckhtml
  2652. ->getElementsByClassName(
  2653. "ivg-i",
  2654. "div"
  2655. );
  2656. foreach($images_div as $image_div){
  2657. $this->fuckhtml->load($image_div);
  2658. $image =
  2659. $this->fuckhtml
  2660. ->getElementsByTagName(
  2661. "img"
  2662. );
  2663. if(
  2664. count($image) === 0 ||
  2665. !isset(
  2666. $this->image_arr[
  2667. $image_div
  2668. ["attributes"]
  2669. ["data-docid"]
  2670. ]
  2671. )
  2672. ){
  2673. continue;
  2674. }
  2675. foreach($out["image"] as $existing_image){
  2676. // might already exist
  2677. if(
  2678. $existing_image["source"][1]["url"] ==
  2679. $this->image_arr[
  2680. $image_div
  2681. ["attributes"]
  2682. ["data-docid"]
  2683. ][1]["url"]
  2684. ){
  2685. continue 2;
  2686. }
  2687. }
  2688. $out["image"][] = [
  2689. "title" =>
  2690. $this->titledots(
  2691. $this->fuckhtml
  2692. ->getTextContent(
  2693. $image[0]
  2694. ["attributes"]
  2695. ["alt"]
  2696. )
  2697. ),
  2698. "source" =>
  2699. $this->image_arr[
  2700. $image_div
  2701. ["attributes"]
  2702. ["data-docid"]
  2703. ],
  2704. "url" =>
  2705. $this->fuckhtml
  2706. ->getTextContent(
  2707. $image_div
  2708. ["attributes"]
  2709. ["data-lpage"]
  2710. )
  2711. ];
  2712. }
  2713. // reset
  2714. $this->fuckhtml->load($rhs);
  2715. }
  2716. // get header container
  2717. $header =
  2718. $this->fuckhtml
  2719. ->getElementsByClassName(
  2720. $this->getstyle(
  2721. [
  2722. "padding" => "0 0 16px 20px",
  2723. "display" => "flex"
  2724. ]
  2725. ),
  2726. "div"
  2727. );
  2728. // stop parsing wikipedia heads if there isn't a header
  2729. $description = [];
  2730. $title = "About";
  2731. if(count($header) !== 0){
  2732. $this->fuckhtml->load($header[0]);
  2733. // g-snackbar-action present: we found a button instead
  2734. if(
  2735. count(
  2736. $this->fuckhtml
  2737. ->getElementsByTagName(
  2738. "g-snackbar-action"
  2739. )
  2740. ) !== 0
  2741. ){
  2742. $title_tag =
  2743. $this->fuckhtml
  2744. ->getElementsByAttributeValue(
  2745. "data-attrid",
  2746. "title",
  2747. "div"
  2748. );
  2749. if(count($title_tag) !== 0){
  2750. $title =
  2751. $this->fuckhtml
  2752. ->getTextContent(
  2753. $title_tag[0]
  2754. );
  2755. $header[0]["innerHTML"] =
  2756. str_replace(
  2757. $title_tag[0]["outerHTML"],
  2758. "",
  2759. $header[0]["innerHTML"]
  2760. );
  2761. // if header still contains text, add it as a subtitle in description
  2762. $subtitle =
  2763. $this->fuckhtml
  2764. ->getTextContent(
  2765. $header[0]
  2766. );
  2767. if(strlen($subtitle) !== 0){
  2768. $description[] = [
  2769. "type" => "quote",
  2770. "value" => $subtitle
  2771. ];
  2772. }
  2773. }
  2774. }
  2775. // reset
  2776. $this->fuckhtml->load($rhs);
  2777. }
  2778. // get description elements
  2779. $url = null;
  2780. $text =
  2781. $this->fuckhtml
  2782. ->getElementsByAttributeValue(
  2783. "data-attrid",
  2784. "description",
  2785. "div"
  2786. );
  2787. if(count($text) !== 0){
  2788. $this->fuckhtml->load($text[0]);
  2789. $a =
  2790. $this->fuckhtml
  2791. ->getElementsByTagName(
  2792. "a"
  2793. );
  2794. if(count($a) !== 0){
  2795. // get link and remove it from description
  2796. $a = $a[count($a) - 1];
  2797. $text[0]["innerHTML"] =
  2798. str_replace(
  2799. $a["outerHTML"],
  2800. "",
  2801. $text[0]["innerHTML"]
  2802. );
  2803. $url =
  2804. $this->fuckhtml
  2805. ->getTextContent(
  2806. $a
  2807. ["attributes"]
  2808. ["href"]
  2809. );
  2810. }
  2811. $description[] = [
  2812. "type" => "text",
  2813. "value" =>
  2814. html_entity_decode(
  2815. preg_replace(
  2816. '/^Description/',
  2817. "",
  2818. $this->fuckhtml
  2819. ->getTextContent(
  2820. $text[0]
  2821. )
  2822. )
  2823. )
  2824. ];
  2825. // reset
  2826. $this->fuckhtml->load($rhs);
  2827. }
  2828. // get reviews (google play, steam, etc)
  2829. $review_container =
  2830. $this->fuckhtml
  2831. ->getElementsByClassName(
  2832. $this->getstyle(
  2833. [
  2834. "align-items" => "start",
  2835. "display" => "flex"
  2836. ]
  2837. ),
  2838. "div"
  2839. );
  2840. if(count($review_container) !== 0){
  2841. $this->fuckhtml->load($review_container[0]);
  2842. $as =
  2843. $this->fuckhtml
  2844. ->getElementsByTagName(
  2845. "a"
  2846. );
  2847. if(count($as) !== 0){
  2848. $description[] = [
  2849. "type" => "title",
  2850. "value" => "Ratings"
  2851. ];
  2852. foreach($as as $a){
  2853. $this->fuckhtml->load($a);
  2854. $spans =
  2855. $this->fuckhtml
  2856. ->getElementsByTagName(
  2857. "span"
  2858. );
  2859. if(count($spans) >= 2){
  2860. $value =
  2861. trim(
  2862. $this->fuckhtml
  2863. ->getTextContent(
  2864. $spans[1]
  2865. ),
  2866. "· "
  2867. );
  2868. if(
  2869. $value == "" &&
  2870. isset($spans[2])
  2871. ){
  2872. $value =
  2873. $this->fuckhtml
  2874. ->getTextContent(
  2875. $spans[2]
  2876. );
  2877. }
  2878. $description[] = [
  2879. "type" => "link",
  2880. "url" =>
  2881. $this->fuckhtml
  2882. ->getTextContent(
  2883. $a["attributes"]
  2884. ["href"]
  2885. ),
  2886. "value" => $value
  2887. ];
  2888. $description[] = [
  2889. "type" => "text",
  2890. "value" =>
  2891. ": " .
  2892. $this->fuckhtml
  2893. ->getTextContent(
  2894. $spans[0]
  2895. ) . "\n"
  2896. ];
  2897. }
  2898. }
  2899. }
  2900. // reset
  2901. $this->fuckhtml->load($rhs);
  2902. }
  2903. // initialize sublinks
  2904. $sublinks = [];
  2905. // get description from business
  2906. if(count($description) === 0){
  2907. $data_attrid =
  2908. $this->fuckhtml
  2909. ->getElementsByAttributeName(
  2910. "data-attrid"
  2911. );
  2912. $summary =
  2913. $this->fuckhtml
  2914. ->getElementsByAttributeValue(
  2915. "data-attrid",
  2916. "kc:/local:one line summary",
  2917. $data_attrid
  2918. );
  2919. if(count($summary) !== 0){
  2920. $description[] = [
  2921. "type" => "quote",
  2922. "value" =>
  2923. $this->fuckhtml
  2924. ->getTextContent(
  2925. $summary[0]
  2926. )
  2927. ];
  2928. // remove summary so it doesnt get parsed as a table
  2929. $rhs["innerHTML"] =
  2930. str_replace(
  2931. $summary[0]["outerHTML"],
  2932. "",
  2933. $rhs["innerHTML"]
  2934. );
  2935. $this->fuckhtml->load($rhs);
  2936. }
  2937. $address =
  2938. $this->fuckhtml
  2939. ->getElementsByAttributeValue(
  2940. "data-attrid",
  2941. "kc:/location/location:address",
  2942. $data_attrid
  2943. );
  2944. if(count($address) !== 0){
  2945. $description[] = [
  2946. "type" => "text",
  2947. "value" =>
  2948. $this->fuckhtml
  2949. ->getTextContent(
  2950. $address[0]
  2951. )
  2952. ];
  2953. }
  2954. // get title
  2955. $title_div =
  2956. $this->fuckhtml
  2957. ->getElementsByAttributeValue(
  2958. "data-attrid",
  2959. "title",
  2960. $data_attrid
  2961. );
  2962. if(count($title_div) !== 0){
  2963. $title =
  2964. $this->fuckhtml
  2965. ->getTextContent(
  2966. $title_div[0]
  2967. );
  2968. }
  2969. // get phone number
  2970. $phone =
  2971. $this->fuckhtml
  2972. ->getElementsByAttributeValue(
  2973. "data-attrid",
  2974. "kc:/local:alt phone",
  2975. $data_attrid
  2976. );
  2977. if(count($phone) !== 0){
  2978. $this->fuckhtml->load($phone[0]);
  2979. $sublinks["Call"] =
  2980. "tel:" .
  2981. $this->fuckhtml
  2982. ->getTextContent(
  2983. $this->fuckhtml
  2984. ->getElementsByAttributeName(
  2985. "aria-label",
  2986. "span"
  2987. )[0]
  2988. );
  2989. $this->fuckhtml->load($rhs);
  2990. }
  2991. }
  2992. if(count($description) === 0){
  2993. // still no description? abort
  2994. return $out;
  2995. }
  2996. // get table elements
  2997. $table = [];
  2998. $table_elems =
  2999. $this->fuckhtml
  3000. ->getElementsByClassName(
  3001. $this->getstyle(
  3002. [
  3003. "margin-top" => "7px"
  3004. ]
  3005. ),
  3006. "div"
  3007. );
  3008. foreach($table_elems as $elem){
  3009. $this->fuckhtml->load($elem);
  3010. $spans =
  3011. $this->fuckhtml
  3012. ->getElementsByTagName(
  3013. "span"
  3014. );
  3015. if(count($spans) === 0){
  3016. // ?? invalid
  3017. continue;
  3018. }
  3019. $elem["innerHTML"] =
  3020. str_replace(
  3021. $spans[0]["outerHTML"],
  3022. "",
  3023. $elem["innerHTML"]
  3024. );
  3025. $key =
  3026. rtrim(
  3027. $this->fuckhtml
  3028. ->getTextContent(
  3029. $spans[0]
  3030. ),
  3031. ": "
  3032. );
  3033. if(
  3034. $key == "" ||
  3035. $key == "Phone"
  3036. ){
  3037. continue;
  3038. }
  3039. if($key == "Hours"){
  3040. $hours = [];
  3041. $this->fuckhtml->load($elem);
  3042. $trs =
  3043. $this->fuckhtml
  3044. ->getElementsByTagName(
  3045. "tr"
  3046. );
  3047. foreach($trs as $tr){
  3048. $this->fuckhtml->load($tr);
  3049. $tds =
  3050. $this->fuckhtml
  3051. ->getElementsByTagName(
  3052. "td"
  3053. );
  3054. if(count($tds) === 2){
  3055. $hours[] =
  3056. $this->fuckhtml
  3057. ->getTextContent(
  3058. $tds[0]
  3059. ) . ": " .
  3060. $this->fuckhtml
  3061. ->getTextContent(
  3062. $tds[1]
  3063. );
  3064. }
  3065. }
  3066. if(count($hours) !== 0){
  3067. $hours = implode("\n", $hours);
  3068. $table["Hours"] = $hours;
  3069. }
  3070. continue;
  3071. }
  3072. $table[$key] =
  3073. preg_replace(
  3074. '/ +/',
  3075. " ",
  3076. $this->fuckhtml
  3077. ->getTextContent(
  3078. $elem
  3079. )
  3080. );
  3081. }
  3082. // reset
  3083. $this->fuckhtml->load($rhs);
  3084. // get the website div
  3085. $as =
  3086. $this->fuckhtml
  3087. ->getElementsByAttributeValue(
  3088. "data-attrid",
  3089. "visit_official_site",
  3090. "a"
  3091. );
  3092. if(count($as) !== 0){
  3093. $sublinks["Website"] =
  3094. str_replace(
  3095. "http://",
  3096. "https://",
  3097. $this->fuckhtml
  3098. ->getTextContent(
  3099. $as[0]
  3100. ["attributes"]
  3101. ["href"]
  3102. )
  3103. );
  3104. }else{
  3105. // get website through button
  3106. $button =
  3107. $this->fuckhtml
  3108. ->getElementsByClassName(
  3109. "ab_button",
  3110. "a"
  3111. );
  3112. if(count($button) !== 0){
  3113. $sublinks["Website"] =
  3114. $this->unshiturl(
  3115. $this->fuckhtml
  3116. ->getTextContent(
  3117. $button[0]
  3118. ["attributes"]
  3119. ["href"]
  3120. )
  3121. );
  3122. }
  3123. }
  3124. // get social media links
  3125. $as =
  3126. $this->fuckhtml
  3127. ->getElementsByTagName(
  3128. "g-link"
  3129. );
  3130. foreach($as as $a){
  3131. $this->fuckhtml->load($a);
  3132. $link =
  3133. $this->fuckhtml
  3134. ->getElementsByTagName(
  3135. "a"
  3136. );
  3137. if(count($link) === 0){
  3138. continue;
  3139. }
  3140. $sublink_title =
  3141. $this->fuckhtml
  3142. ->getTextContent(
  3143. $a
  3144. );
  3145. if($sublink_title == "X (Twitter)"){
  3146. $sublink_title = "Twitter";
  3147. }
  3148. $sublinks[$sublink_title] =
  3149. $this->fuckhtml
  3150. ->getTextContent(
  3151. $link[0]
  3152. ["attributes"]
  3153. ["href"]
  3154. );
  3155. }
  3156. // reset
  3157. $this->fuckhtml->load($rhs);
  3158. // get those round containers
  3159. $containers =
  3160. $this->fuckhtml
  3161. ->getElementsByClassName(
  3162. "tpa-ci"
  3163. );
  3164. foreach($containers as $container){
  3165. $this->fuckhtml->load($container);
  3166. $as =
  3167. $this->fuckhtml
  3168. ->getElementsByTagName(
  3169. "a"
  3170. );
  3171. if(count($as) === 0){
  3172. continue;
  3173. }
  3174. $sublinks[
  3175. $this->fuckhtml
  3176. ->getTextContent(
  3177. $as[0]
  3178. )
  3179. ] =
  3180. $this->fuckhtml
  3181. ->getTextContent(
  3182. $as[0]
  3183. ["attributes"]
  3184. ["href"]
  3185. );
  3186. }
  3187. $out["answer"][] = [
  3188. "title" => $title,
  3189. "description" => $description,
  3190. "url" => $url,
  3191. "thumb" => null,
  3192. "table" => $table,
  3193. "sublink" => $sublinks
  3194. ];
  3195. return $out;
  3196. }
  3197. private function scrape_dimg($html){
  3198. // get images loaded through javascript
  3199. $this->dimg = [];
  3200. preg_match_all(
  3201. '/function\(\){google\.ldi=({.*?});/',
  3202. $html,
  3203. $dimg
  3204. );
  3205. if(isset($dimg[1])){
  3206. foreach($dimg[1] as $i){
  3207. $tmp = json_decode($i, true);
  3208. foreach($tmp as $key => $value){
  3209. $this->dimg[$key] =
  3210. $this->unshit_thumb(
  3211. $value
  3212. );
  3213. }
  3214. }
  3215. }
  3216. // get additional javascript base64 images
  3217. preg_match_all(
  3218. '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/',
  3219. $html,
  3220. $dimg
  3221. );
  3222. if(isset($dimg[1])){
  3223. for($i=0; $i<count($dimg[1]); $i++){
  3224. $delims = explode(",", $dimg[2][$i]);
  3225. $string =
  3226. $this->fuckhtml
  3227. ->parseJsString(
  3228. $dimg[1][$i]
  3229. );
  3230. foreach($delims as $delim){
  3231. $this->dimg[trim($delim, "'")] = $string;
  3232. }
  3233. }
  3234. }
  3235. }
  3236. private function scrape_imagearr($html){
  3237. // get image links arrays
  3238. preg_match_all(
  3239. '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/',
  3240. $html,
  3241. $image_arr
  3242. );
  3243. $this->image_arr = [];
  3244. if(isset($image_arr[1])){
  3245. for($i=0; $i<count($image_arr[1]); $i++){
  3246. $this->image_arr[$image_arr[1][$i]] =
  3247. [
  3248. [
  3249. "url" =>
  3250. $this->fuckhtml
  3251. ->parseJsString(
  3252. $image_arr[5][$i]
  3253. ),
  3254. "width" => (int)$image_arr[7][$i],
  3255. "height" => (int)$image_arr[6][$i]
  3256. ],
  3257. [
  3258. "url" =>
  3259. $this->unshit_thumb(
  3260. $this->fuckhtml
  3261. ->parseJsString(
  3262. $image_arr[2][$i]
  3263. )
  3264. ),
  3265. "width" => (int)$image_arr[4][$i],
  3266. "height" => (int)$image_arr[3][$i]
  3267. ]
  3268. ];
  3269. }
  3270. }
  3271. }
  3272. private function getdimg($dimg){
  3273. return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null;
  3274. }
  3275. private function unshit_thumb($url){
  3276. //
  3277. //
  3278. $parts = parse_url($url);
  3279. if(
  3280. isset($parts["host"]) &&
  3281. preg_match(
  3282. '/tbn.*\.gstatic\.com/',
  3283. $parts["host"]
  3284. )
  3285. ){
  3286. parse_str($parts["query"], $params);
  3287. if(isset($params["q"])){
  3288. return "https://" . $parts["host"] . "/images?q=" . $params["q"];
  3289. }
  3290. }
  3291. return $url;
  3292. }
  3293. private function parsestyles(){
  3294. $styles = [];
  3295. $style_div =
  3296. $this->fuckhtml
  3297. ->getElementsByTagName(
  3298. "style"
  3299. );
  3300. $raw_styles = "";
  3301. foreach($style_div as $style){
  3302. $raw_styles .= $style["innerHTML"];
  3303. }
  3304. // filter out media/keyframe queries
  3305. $raw_styles =
  3306. preg_replace(
  3307. '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/',
  3308. "",
  3309. $raw_styles
  3310. );
  3311. // get styles
  3312. preg_match_all(
  3313. '/(.+?){([\S\s]*?)}/',
  3314. $raw_styles,
  3315. $matches
  3316. );
  3317. for($i=0; $i<count($matches[1]); $i++){
  3318. // get style values
  3319. preg_match_all(
  3320. '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/',
  3321. $matches[2][$i],
  3322. $values_regex
  3323. );
  3324. $values = [];
  3325. for($k=0; $k<count($values_regex[1]); $k++){
  3326. $values[trim($values_regex[1][$k])] =
  3327. strtolower(trim($values_regex[2][$k]));
  3328. }
  3329. $names = explode(",", $matches[1][$i]);
  3330. // h1,h2,h3 will each get their own array index
  3331. foreach($names as $name){
  3332. $name = trim($name, "}\t\n\r\0\x0B");
  3333. foreach($values as $key => $value){
  3334. $styles[$name][$key] = $value;
  3335. }
  3336. }
  3337. }
  3338. foreach($styles as $key => $values){
  3339. $styles[$key]["_c"] = count($values);
  3340. }
  3341. $this->styles = $styles;
  3342. // get CSS colors
  3343. $this->css_colors = [];
  3344. if(isset($this->styles[":root"])){
  3345. foreach($this->styles[":root"] as $key => $value){
  3346. $this->css_colors[$value] = strtolower($key);
  3347. }
  3348. }
  3349. }
  3350. private function getstyle($styles){
  3351. $styles["_c"] = count($styles);
  3352. foreach($this->styles as $style_key => $style_values){
  3353. if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){
  3354. $style_key =
  3355. explode(" ", $style_key);
  3356. $style_key = $style_key[count($style_key) - 1];
  3357. return
  3358. ltrim(
  3359. str_replace(
  3360. [".", "#"],
  3361. " ",
  3362. $style_key
  3363. )
  3364. );
  3365. }
  3366. }
  3367. return false;
  3368. }
  3369. private function getcolorvar($color){
  3370. if(isset($this->css_colors[$color])){
  3371. return $this->css_colors[$color];
  3372. }
  3373. return null;
  3374. }
  3375. public function web($get){
  3376. if($get["npt"]){
  3377. [$params, $proxy] = $this->backend->get($get["npt"], "web");
  3378. $params = json_decode($params, true);
  3379. $search = $params["q"];
  3380. }else{
  3381. $search = $get["s"];
  3382. $country = $get["country"];
  3383. $nsfw = $get["nsfw"];
  3384. $lang = $get["lang"];
  3385. $older = $get["older"];
  3386. $newer = $get["newer"];
  3387. $spellcheck = $get["spellcheck"];
  3388. $proxy = $this->backend->get_ip();
  3389. $offset = 0;
  3390. $params = [
  3391. "q" => $search,
  3392. "hl" => "en",
  3393. "num" => 20 // get 20 results
  3394. ];
  3395. // country
  3396. if($country != "any"){
  3397. $params["gl"] = $country;
  3398. }
  3399. // nsfw
  3400. $params["safe"] = $nsfw == "yes" ? "off" : "active";
  3401. // language
  3402. if($lang != "any"){
  3403. $params["lr"] = "lang_" . $lang;
  3404. }
  3405. // generate tbs
  3406. $tbs = [];
  3407. // get date
  3408. $older = $older === false ? null : date("m/d/Y", $older);
  3409. $newer = $newer === false ? null : date("m/d/Y", $newer);
  3410. if(
  3411. $older !== null ||
  3412. $newer !== null
  3413. ){
  3414. $tbs["cdr"] = "1";
  3415. $tbs["cd_min"] = $newer;
  3416. $tbs["cd_max"] = $older;
  3417. }
  3418. // spellcheck filter
  3419. if($spellcheck == "no"){
  3420. $params["nfpr"] = "1";
  3421. }
  3422. if(count($tbs) !== 0){
  3423. $params["tbs"] = "";
  3424. foreach($tbs as $key => $value){
  3425. $params["tbs"] .= $key . ":" . $value . ",";
  3426. }
  3427. $params["tbs"] = rtrim($params["tbs"], ",");
  3428. }
  3429. }
  3430. try{
  3431. $html =
  3432. $this->get(
  3433. $proxy,
  3434. "",
  3435. $params
  3436. );
  3437. }catch(Exception $error){
  3438. throw new Exception("Failed to get HTML");
  3439. }
  3440. //$html = file_get_contents("scraper/google.html");
  3441. return $this->parsepage($html, "web", $search, $proxy, $params);
  3442. }
  3443. public function video($get){
  3444. if($get["npt"]){
  3445. [$params, $proxy] = $this->backend->get($get["npt"], "video");
  3446. $params = json_decode($params, true);
  3447. $search = $params["q"];
  3448. }else{
  3449. $search = $get["s"];
  3450. $country = $get["country"];
  3451. $nsfw = $get["nsfw"];
  3452. $older = $get["older"];
  3453. $newer = $get["newer"];
  3454. $duration = $get["duration"];
  3455. $quality = $get["quality"];
  3456. $captions = $get["captions"];
  3457. $proxy = $this->backend->get_ip();
  3458. $params = [
  3459. "q" => $search,
  3460. "tbm" => "vid",
  3461. "hl" => "en",
  3462. "num" => "20"
  3463. ];
  3464. // country
  3465. if($country != "any"){
  3466. $params["gl"] = $country;
  3467. }
  3468. // nsfw
  3469. $params["safe"] = $nsfw == "yes" ? "off" : "active";
  3470. $tbs = [];
  3471. // get date
  3472. $older = $older === false ? null : date("m/d/Y", $older);
  3473. $newer = $newer === false ? null : date("m/d/Y", $newer);
  3474. if(
  3475. $older !== null ||
  3476. $newer !== null
  3477. ){
  3478. $tbs["cdr"] = "1";
  3479. $tbs["cd_min"] = $newer;
  3480. $tbs["cd_max"] = $older;
  3481. }
  3482. // duration
  3483. if($duration != "any"){
  3484. $tbs[] = "dur:" . $duration;
  3485. }
  3486. // quality
  3487. if($quality != "any"){
  3488. $tbs[] = "hq:" . $quality;
  3489. }
  3490. // captions
  3491. if($captions != "any"){
  3492. $tbs[] = "cc:" . $captions;
  3493. }
  3494. // append tbs
  3495. if(count($tbs) !== 0){
  3496. $params["tbs"] =
  3497. implode(",", $tbs);
  3498. }
  3499. }
  3500. try{
  3501. $html =
  3502. $this->get(
  3503. $proxy,
  3504. "",
  3505. $params
  3506. );
  3507. }catch(Exception $error){
  3508. throw new Exception("Failed to get HTML");
  3509. }
  3510. //$html = file_get_contents("scraper/google.html");
  3511. $response = $this->parsepage($html, "videos", $search, $proxy, $params);
  3512. $out = [
  3513. "status" => "ok",
  3514. "npt" => $response["npt"],
  3515. "video" => [],
  3516. "author" => [],
  3517. "livestream" => [],
  3518. "playlist" => [],
  3519. "reel" => []
  3520. ];
  3521. foreach($response["web"] as $result){
  3522. $out["video"][] = [
  3523. "title" => $result["title"],
  3524. "description" => $result["description"],
  3525. "author" => [
  3526. "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null,
  3527. "url" => null,
  3528. "avatar" => null
  3529. ],
  3530. "date" => $result["date"],
  3531. "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null,
  3532. "views" => null,
  3533. "thumb" => $result["thumb"],
  3534. "url" => $result["url"]
  3535. ];
  3536. }
  3537. return $out;
  3538. }
  3539. public function news($get){
  3540. if($get["npt"]){
  3541. [$req, $proxy] = $this->backend->get($get["npt"], "news");
  3542. /*parse_str(
  3543. parse_url($req, PHP_URL_QUERY),
  3544. $search
  3545. );*/
  3546. try{
  3547. $html =
  3548. $this->get(
  3549. $proxy,
  3550. "" . $req,
  3551. []
  3552. );
  3553. }catch(Exception $error){
  3554. throw new Exception("Failed to get HTML");
  3555. }
  3556. }else{
  3557. $search = $get["s"];
  3558. $country = $get["country"];
  3559. $nsfw = $get["nsfw"];
  3560. $older = $get["older"];
  3561. $newer = $get["newer"];
  3562. $sort = $get["sort"];
  3563. $proxy = $this->backend->get_ip();
  3564. $params = [
  3565. "q" => $search,
  3566. "tbm" => "nws",
  3567. "hl" => "en",
  3568. "num" => "20"
  3569. ];
  3570. // country
  3571. if($country != "any"){
  3572. $params["gl"] = $country;
  3573. }
  3574. // nsfw
  3575. $params["safe"] = $nsfw == "yes" ? "off" : "active";
  3576. $tbs = [];
  3577. // get date
  3578. $older = $older === false ? null : date("m/d/Y", $older);
  3579. $newer = $newer === false ? null : date("m/d/Y", $newer);
  3580. if(
  3581. $older !== null ||
  3582. $newer !== null
  3583. ){
  3584. $tbs["cdr"] = "1";
  3585. $tbs["cd_min"] = $newer;
  3586. $tbs["cd_max"] = $older;
  3587. }
  3588. // relevance
  3589. if($sort == "date"){
  3590. $tbs["sbd"] = "1";
  3591. }
  3592. // append tbs
  3593. if(count($tbs) !== 0){
  3594. $params["tbs"] = "";
  3595. foreach($tbs as $key => $value){
  3596. $params["tbs"] .= $key . ":" . $value . ",";
  3597. }
  3598. $params["tbs"] = rtrim($params["tbs"], ",");
  3599. }
  3600. //$html = file_get_contents("scraper/google-news.html");
  3601. $html =
  3602. $this->get(
  3603. $proxy,
  3604. "",
  3605. $params
  3606. );
  3607. }
  3608. $out = [
  3609. "status" => "ok",
  3610. "npt" => null,
  3611. "news" => []
  3612. ];
  3613. $this->fuckhtml->load($html);
  3614. $this->detect_sorry();
  3615. // get images
  3616. $this->scrape_dimg($html);
  3617. // parse styles
  3618. $this->parsestyles();
  3619. $center_col =
  3620. $this->fuckhtml
  3621. ->getElementById(
  3622. "center_col",
  3623. "div"
  3624. );
  3625. if($center_col === null){
  3626. throw new Exception("Could not grep result div");
  3627. }
  3628. $this->fuckhtml->load($center_col);
  3629. // get next page
  3630. $npt =
  3631. $this->fuckhtml
  3632. ->getElementById(
  3633. "pnnext",
  3634. "a"
  3635. );
  3636. if($npt !== false){
  3637. $out["npt"] =
  3638. $this->backend->store(
  3639. $this->fuckhtml
  3640. ->getTextContent(
  3641. $npt["attributes"]
  3642. ["href"]
  3643. ),
  3644. "news",
  3645. $proxy
  3646. );
  3647. }
  3648. $as =
  3649. $this->fuckhtml
  3650. ->getElementsByAttributeName(
  3651. "jsname",
  3652. "a"
  3653. );
  3654. foreach($as as $a){
  3655. $this->fuckhtml->load($a);
  3656. // get title
  3657. $title =
  3658. $this->fuckhtml
  3659. ->getElementsByAttributeValue(
  3660. "role",
  3661. "heading",
  3662. "div"
  3663. );
  3664. if(count($title) === 0){
  3665. continue;
  3666. }
  3667. $title =
  3668. $this->titledots(
  3669. $this->fuckhtml
  3670. ->getTextContent(
  3671. $title[0]
  3672. )
  3673. );
  3674. // get thumbnail
  3675. $image =
  3676. $this->fuckhtml
  3677. ->getElementsByAttributeName(
  3678. "id",
  3679. "img"
  3680. );
  3681. // check for padded title node, if found, we're inside a carousel
  3682. $probe =
  3683. $this->fuckhtml
  3684. ->getElementsByClassName(
  3685. $this->getstyle(
  3686. [
  3687. "padding" => "16px 16px 40px 16px"
  3688. ]
  3689. ),
  3690. "div"
  3691. );
  3692. if(count($probe) !== 0){
  3693. $probe = true;
  3694. }else{
  3695. $probe = false;
  3696. }
  3697. if(
  3698. count($image) !== 0 &&
  3699. !isset($image[0]["attributes"]["width"])
  3700. ){
  3701. $thumb = [
  3702. "url" =>
  3703. $this->getdimg(
  3704. $image[0]["attributes"]["id"]
  3705. ),
  3706. "ratio" => $probe === true ? "16:9" : "1:1"
  3707. ];
  3708. }else{
  3709. $thumb = [
  3710. "url" => null,
  3711. "ratio" => null
  3712. ];
  3713. }
  3714. $description = null;
  3715. if($probe === false){
  3716. $desc_divs =
  3717. $this->fuckhtml
  3718. ->getElementsByAttributeName(
  3719. "style",
  3720. "div"
  3721. );
  3722. foreach($desc_divs as $desc){
  3723. if(
  3724. strpos(
  3725. $desc["attributes"]["style"],
  3726. "margin-top:"
  3727. ) !== false
  3728. ){
  3729. $description =
  3730. $this->titledots(
  3731. $this->fuckhtml
  3732. ->getTextContent(
  3733. $desc
  3734. )
  3735. );
  3736. break;
  3737. }
  3738. }
  3739. }
  3740. // get author
  3741. $author =
  3742. $this->fuckhtml
  3743. ->getElementsByClassName(
  3744. $this->getstyle(
  3745. [
  3746. "overflow" => "hidden",
  3747. "text-align" => "left",
  3748. "text-overflow" => "ellipsis",
  3749. "white-space" => "nowrap",
  3750. "margin-bottom" => "8px"
  3751. ]
  3752. ),
  3753. "div"
  3754. );
  3755. if(count($author) !== 0){
  3756. $author =
  3757. $this->fuckhtml
  3758. ->getTextContent(
  3759. $author[0]
  3760. );
  3761. }else{
  3762. $author = null;
  3763. }
  3764. // get date
  3765. $date = null;
  3766. $date_div =
  3767. $this->fuckhtml
  3768. ->getElementsByAttributeName(
  3769. "style",
  3770. "div"
  3771. );
  3772. foreach($date_div as $d){
  3773. $this->fuckhtml->load($d);
  3774. $span =
  3775. $this->fuckhtml
  3776. ->getElementsByTagName(
  3777. "span"
  3778. );
  3779. if(
  3780. strpos(
  3781. $d["attributes"]["style"],
  3782. "bottom:"
  3783. ) !== false
  3784. ){
  3785. $date =
  3786. strtotime(
  3787. $this->fuckhtml
  3788. ->getTextContent(
  3789. $span[count($span) - 1]
  3790. )
  3791. );
  3792. break;
  3793. }
  3794. }
  3795. $out["news"][] = [
  3796. "title" => $title,
  3797. "author" => $author,
  3798. "description" => $description,
  3799. "date" => $date,
  3800. "thumb" => $thumb,
  3801. "url" =>
  3802. $this->unshiturl(
  3803. $a["attributes"]
  3804. ["href"]
  3805. )
  3806. ];
  3807. }
  3808. return $out;
  3809. }
  3810. public function image($get){
  3811. // generate parameters
  3812. if($get["npt"]){
  3813. [$params, $proxy] =
  3814. $this->backend->get(
  3815. $get["npt"],
  3816. "images"
  3817. );
  3818. $params = json_decode($params, true);
  3819. }else{
  3820. $search = $get["s"];
  3821. if(strlen($search) === 0){
  3822. throw new Exception("Search term is empty!");
  3823. }
  3824. $proxy = $this->backend->get_ip();
  3825. $country = $get["country"];
  3826. $nsfw = $get["nsfw"];
  3827. $time = $get["time"];
  3828. $size = $get["size"];
  3829. $ratio = $get["ratio"];
  3830. $color = $get["color"];
  3831. $type = $get["type"];
  3832. $format = $get["format"];
  3833. $rights = $get["rights"];
  3834. $params = [
  3835. "q" => $search,
  3836. "udm" => "2" // get images
  3837. ];
  3838. // country (image search uses cr instead of gl)
  3839. if($country != "any"){
  3840. $params["cr"] = "country" . strtoupper($country);
  3841. }
  3842. // nsfw
  3843. $params["safe"] = $nsfw == "yes" ? "off" : "active";
  3844. // generate tbs
  3845. $tbs = [];
  3846. // time
  3847. if($time != "any"){
  3848. $tbs["qdr"] = $time;
  3849. }
  3850. // size
  3851. if($size != "any"){
  3852. $params["imgsz"] = $size;
  3853. }
  3854. // ratio
  3855. if($ratio != "any"){
  3856. $params["imgar"] = $ratio;
  3857. }
  3858. // color
  3859. if($color != "any"){
  3860. if(
  3861. $color == "color" ||
  3862. $color == "trans"
  3863. ){
  3864. $params["imgc"] = $color;
  3865. }elseif($color == "bnw"){
  3866. $params["imgc"] = "gray";
  3867. }else{
  3868. $tbs["ic"] = "specific";
  3869. $tbs["isc"] = $color;
  3870. }
  3871. }
  3872. // type
  3873. if($type != "any"){
  3874. $tbs["itp"] = $type;
  3875. }
  3876. // format
  3877. if($format != "any"){
  3878. $params["as_filetype"] = $format;
  3879. }
  3880. // rights (tbs)
  3881. if($rights != "any"){
  3882. $tbs["sur"] = $rights;
  3883. }
  3884. // append tbs
  3885. if(count($tbs) !== 0){
  3886. $params["tbs"] = "";
  3887. foreach($tbs as $key => $value){
  3888. $params["tbs"] .= $key . ":" . $value . ",";
  3889. }
  3890. $params["tbs"] = rtrim($params["tbs"], ",");
  3891. }
  3892. }
  3893. /*
  3894. $handle = fopen("scraper/google-img.html", "r");
  3895. $html = fread($handle, filesize("scraper/google-img.html"));
  3896. fclose($handle);*/
  3897. try{
  3898. $html =
  3899. $this->get(
  3900. $proxy,
  3901. "",
  3902. $params
  3903. );
  3904. }catch(Exception $error){
  3905. throw new Exception("Failed to get search page");
  3906. }
  3907. $this->fuckhtml->load($html);
  3908. $this->detect_sorry();
  3909. // get javascript images
  3910. $this->scrape_imagearr($html);
  3911. $out = [
  3912. "status" => "ok",
  3913. "npt" => null,
  3914. "image" => []
  3915. ];
  3916. $images =
  3917. $this->fuckhtml
  3918. ->getElementsByClassName(
  3919. "ivg-i",
  3920. "div"
  3921. );
  3922. foreach($images as $div){
  3923. $this->fuckhtml->load($div);
  3924. $image =
  3925. $this->fuckhtml
  3926. ->getElementsByTagName("img")[0];
  3927. $out["image"][] = [
  3928. "title" =>
  3929. $this->titledots(
  3930. $this->fuckhtml
  3931. ->getTextContent(
  3932. $image["attributes"]["alt"]
  3933. )
  3934. ),
  3935. "source" =>
  3936. $this->image_arr[
  3937. $div["attributes"]["data-docid"]
  3938. ],
  3939. "url" =>
  3940. $this->fuckhtml
  3941. ->getTextContent(
  3942. $div["attributes"]["data-lpage"]
  3943. )
  3944. ];
  3945. }
  3946. // as usual, no way to check if there is a next page reliably
  3947. if(count($out["image"]) > 50){
  3948. if(!isset($params["start"])){
  3949. $params["start"] = 10;
  3950. }else{
  3951. $params["start"] += 10;
  3952. }
  3953. $out["npt"] =
  3954. $this->backend
  3955. ->store(
  3956. json_encode($params),
  3957. "image",
  3958. $proxy
  3959. );
  3960. }
  3961. return $out;
  3962. }
  /**
   * Turn a Google result/redirect URL into a clean destination URL.
   *
   * The input is first passed through fuckhtml's getTextContent(). URLs
   * without a host are treated as Google tracking URLs and unwrapped via
   * their "imgurl" or "q" query parameter. A few well-known sites then get
   * their mobile subdomain or referrer parameters removed.
   *
   * @param string $url          URL to clean.
   * @param bool   $return_size  When true, return an array holding the URL
   *                             plus image metadata read from the tracking
   *                             URL's query string (null when absent).
   * @return string|array  The cleaned URL, or with $return_size an array with
   *                       the keys url, ref, thumb_width, thumb_height,
   *                       image_width and image_height.
   */
  private function unshiturl($url, $return_size = false){

      // decode
      $url =
          $this->fuckhtml
          ->getTextContent($url);

      // parameters of the tracking URL, kept separate from the per-site
      // query handling below so the size metadata is never clobbered
      $tracking = [];

      $url_parts = parse_url($url);

      if(
          is_array($url_parts) &&
          !isset($url_parts["host"]) &&
          isset($url_parts["query"])
      ){

          // no host, we have a tracking url
          parse_str($url_parts["query"], $tracking);

          if(
              isset($tracking["imgurl"]) &&
              is_string($tracking["imgurl"])
          ){

              $url = $tracking["imgurl"];
          }
          elseif(
              isset($tracking["q"]) &&
              is_string($tracking["q"])
          ){

              $url = $tracking["q"];
          }
      }

      // rewrite URLs to remove extra tracking parameters
      // parse_url yields null/false when there is no usable host
      $domain = (string)parse_url($url, PHP_URL_HOST);

      if($domain === ""){

          // nothing to rewrite without a host
      }
      elseif(
          preg_match(
              '/wikipedia\.org$/',
              $domain
          )
      ){

          // rewrite wikipedia mobile URLs to desktop
          $url =
              $this->replacedomain(
                  $url,
                  preg_replace(
                      '/([a-z0-9]+)(\.m\.)/',
                      '$1.',
                      $domain
                  )
              );
      }
      elseif(
          preg_match(
              '/imdb\.com$|youtube\.[^.]+$/',
              $domain
          )
      ){

          // rewrite imdb and youtube mobile URLs too
          $url =
              $this->replacedomain(
                  $url,
                  preg_replace(
                      '/^m\./',
                      "",
                      $domain
                  )
              );
      }
      elseif(
          preg_match(
              '/play\.google\.[^.]+$/',
              $domain
          )
      ){

          // remove referrers and locale hints from play store links
          $url =
              $this->stripqueryparams(
                  $url,
                  ["referrer", "hl", "gl"]
              );
      }
      elseif(
          preg_match(
              '/twitter\.com$/',
              $domain
          )
      ){

          // remove more referrers from twitter links
          $url =
              $this->stripqueryparams(
                  $url,
                  ["ref_src"]
              );
      }
      elseif(
          preg_match(
              '/maps\.google\.[^.]+/',
              $domain
          )
      ){

          if(stripos($url, "maps?") !== false){

              // keep only the destination address (daddr) of a directions link
              $mapsquery = parse_url($url, PHP_URL_QUERY);

              if(is_string($mapsquery)){

                  parse_str($mapsquery, $mapsparams);

                  if(
                      isset($mapsparams["daddr"]) &&
                      is_string($mapsparams["daddr"])
                  ){

                      // NOTE(review): rebuilt on the link's own host; confirm
                      // this is the intended canonical form
                      $url =
                          "https://" . $domain . "/maps?daddr=" .
                          urlencode($mapsparams["daddr"]);
                  }
              }
          }
      }

      if($return_size){

          return [
              "url" => $url,
              "ref" => isset($tracking["imgrefurl"]) ? $tracking["imgrefurl"] : null,
              "thumb_width" => isset($tracking["tbnw"]) ? (int)$tracking["tbnw"] : null,
              "thumb_height" => isset($tracking["tbnh"]) ? (int)$tracking["tbnh"] : null,
              "image_width" => isset($tracking["w"]) ? (int)$tracking["w"] : null,
              "image_height" => isset($tracking["h"]) ? (int)$tracking["h"] : null
          ];
      }

      return $url;
  }

  /**
   * Remove the named parameters from a URL's query string.
   *
   * The URL is returned untouched when it has no query or none of the
   * parameters are present. When every parameter is removed the dangling
   * "?" is dropped as well.
   *
   * @param string   $url    URL to clean.
   * @param string[] $names  Query parameter names to remove.
   * @return string  The URL without those parameters.
   */
  private function stripqueryparams($url, $names){

      $oldquery = parse_url($url, PHP_URL_QUERY);

      if(
          !is_string($oldquery) ||
          $oldquery === ""
      ){

          return $url;
      }

      parse_str($oldquery, $params);

      $changed = false;

      foreach($names as $name){

          if(isset($params[$name])){

              unset($params[$name]);
              $changed = true;
          }
      }

      if($changed === false){

          // avoid re-encoding a query we did not need to touch
          return $url;
      }

      $position = strpos($url, "?" . $oldquery);

      if($position === false){

          return $url;
      }

      $newquery = http_build_query($params);

      return
          substr_replace(
              $url,
              $newquery === "" ? "" : "?" . $newquery,
              $position,
              strlen($oldquery) + 1
          );
  }
  /**
   * Swap the host of an http(s) URL for another one.
   *
   * Only the authority directly after the leading scheme is replaced, so
   * URLs embedded later in the path or query are left alone. URLs that do
   * not start with http:// or https:// are returned unchanged.
   *
   * @param string $url     URL whose host should be replaced.
   * @param string $domain  Host to put in its place.
   * @return string  The rewritten URL.
   */
  private function replacedomain($url, $domain){

      // A callback is used instead of a '$1' . $domain replacement string:
      // a domain starting with a digit would turn "$1" into a two-digit
      // backreference ("$19gag.com"), and "$" or "\" in the domain would be
      // interpreted as replacement syntax.
      return
          preg_replace_callback(
              '/^(https?:\/\/)[^\/]+/',
              function($match) use ($domain){

                  return $match[1] . $domain;
              },
              $url,
              1
          );
  }
  /**
   * Strip leading and trailing whitespace, dots and ellipsis characters
   * from a title.
   *
   * @param string $title  Title to clean.
   * @return string  The title without surrounding " ", ".", control
   *                 whitespace, NUL bytes or "…".
   */
  private function titledots($title){

      $title = (string)$title;

      // trim() works on bytes: listing the multibyte "…" there would strip
      // its three bytes individually and cut other UTF-8 characters in half
      // (e.g. a leading “ or a trailing Ц). Match whole code points instead.
      $trimmed =
          preg_replace(
              '/^[ .\t\n\r\x00\x0B…]+|[ .\t\n\r\x00\x0B…]+$/uD',
              "",
              $title
          );

      if($trimmed === null){

          // not valid UTF-8: fall back to trimming the single-byte characters
          return trim($title, " .\t\n\r\0\x0B");
      }

      return $trimmed;
  }
  /**
   * Convert a "h:m:s", "m:s" or "s" duration string into seconds.
   *
   * The string is split on ":" into at most three fields; each field is
   * cast to int and weighted by its position.
   *
   * @param string $time  Duration string.
   * @return int  Total number of seconds.
   */
  private function hms2int($time){

      $fields = explode(":", $time, 3);

      // pick the weights for however many fields are present:
      // 3 => hours, minutes, seconds; 2 => minutes, seconds; 1 => seconds
      $weights =
          array_slice(
              [3600, 60, 1],
              3 - count($fields)
          );

      $seconds = 0;

      foreach($fields as $index => $field){

          $seconds += (int)$field * $weights[$index];
      }

      return $seconds;
  }
  /**
   * Abort when the currently loaded document contains a captcha.
   *
   * Asks fuckhtml for a <div> with the id "recaptcha"; any result other
   * than false is treated as a captcha page.
   *
   * @throws Exception  When the recaptcha element is found.
   */
  private function detect_sorry(){

      $captcha_div = $this->fuckhtml->getElementById("recaptcha", "div");

      if($captcha_div === false){

          // no captcha element, nothing to report
          return;
      }

      throw new Exception("Google returned a captcha");
  }
  4131. }