startpage.php 34 KB


  1. <?php
  2. class startpage{
  /**
   * Load the helper libraries and create the helper objects used by the
   * scraping methods of this class.
   *
   * include_once is used instead of include so that the helper files are
   * not evaluated a second time (which would re-declare their classes and
   * abort with a fatal error) when they were already loaded earlier in
   * the same request.
   */
  public function __construct(){

      include_once "lib/backend.php";
      $this->backend = new backend("startpage");

      include_once "lib/fuckhtml.php";
      $this->fuckhtml = new fuckhtml();
  }
  /**
   * Describe the search filters available for a page type.
   *
   * Every filter is an array with an "option" list (value => label) and,
   * when the filter should be shown in the frontend, a "display" label.
   * The trailing comments name the Startpage request parameter that the
   * filter is translated to by the scraping methods.
   *
   * @param string $page One of "web", "images", "videos" or "news".
   * @return array|null Filter definitions, or null for any other value.
   */
  public function getfilters($page){

      // identical on every page type
      $nsfw = [ // qadf
          "display" => "NSFW",
          "option" => [
              "yes" => "Yes", // qadf=none
              "no" => "No" // qadf=heavy
          ]
      ];

      switch($page){

          case "web":
              return [
                  "country" => [
                      "display" => "Country",
                      "option" => [
                          "any" => "All Regions",
                          "es_AR" => "Argentina",
                          "en_AU" => "Australia",
                          "de_AT" => "Austria",
                          "ru_BY" => "Belarus",
                          "fr_BE" => "Belgium (FR)",
                          "nl_BE" => "Belgium (NL)",
                          "bg_BG" => "Bulgaria",
                          "en_CA" => "Canada (EN)",
                          "fr_CA" => "Canada (FR)",
                          "es_CL" => "Chile",
                          "es_CO" => "Colombia",
                          "cs_CZ" => "Czech Republic",
                          "da_DK" => "Denmark",
                          "ar_EG" => "Egypt",
                          "et_EE" => "Estonia",
                          "fi_FI" => "Finland",
                          "fr_FR" => "France",
                          "de_DE" => "Germany",
                          "el_GR" => "Greece",
                          "hu_HU" => "Hungary",
                          "hi_IN" => "India (HI)",
                          "en_IN" => "India (EN)",
                          "id_ID" => "Indonesia (ID)",
                          "en_ID" => "Indonesia (EN)",
                          "en_IE" => "Ireland",
                          "it_IT" => "Italy",
                          "ja_JP" => "Japan",
                          "ko_KR" => "Korea",
                          "ms_MY" => "Malaysia (MS)",
                          "en_MY" => "Malaysia (EN)",
                          "es_MX" => "Mexico",
                          "nl_NL" => "Netherlands",
                          "en_NZ" => "New Zealand",
                          "no_NO" => "Norway",
                          "es_PE" => "Peru",
                          "fil_PH" => "Philippines (FIL)",
                          "en_PH" => "Philippines (EN)",
                          "pl_PL" => "Poland",
                          "pt_PT" => "Portugal",
                          "ro_RO" => "Romania",
                          "ru_RU" => "Russia",
                          "ms_SG" => "Singapore (MS)",
                          "en_SG" => "Singapore (EN)",
                          "es_ES" => "Spain (ES)",
                          "ca_ES" => "Spain (CA)",
                          "sv_SE" => "Sweden",
                          "de_CH" => "Switzerland (DE)",
                          "fr_CH" => "Switzerland (FR)",
                          "it_CH" => "Switzerland (IT)",
                          "tr_TR" => "Turkey",
                          "uk_UA" => "Ukraine",
                          "en_US" => "US (EN)",
                          "es_US" => "US (ES)",
                          "es_UY" => "Uruguay",
                          "es_VE" => "Venezuela",
                          "vi_VN" => "Vietnam (VI)",
                          "en_VN" => "Vietnam (EN)",
                          "en_ZA" => "South Africa"
                      ]
                  ],
                  "nsfw" => $nsfw,
                  "time" => [ // with_date
                      "display" => "Time posted",
                      "option" => [
                          "any" => "Any time",
                          "d" => "Past 24 hours",
                          "w" => "Past week",
                          "m" => "Past month",
                          "y" => "Past year"
                      ]
                  ],
                  "extendedsearch" => [
                      // no "display" key, so it won't show in the frontend
                      "option" => [
                          "yes" => "Yes",
                          "no" => "No"
                      ]
                  ]
              ];

          case "images":
              return [
                  "nsfw" => $nsfw,
                  "size" => [ // flimgsize
                      "display" => "Size",
                      "option" => [
                          "any" => "Any size",
                          "Small" => "Small",
                          "Medium" => "Medium",
                          "Large" => "Large",
                          "Wallpaper" => "Wallpaper",
                          // from here, image-size-select, var prefix = isz:lt,islt:
                          // (the 400x300 token is "qsvga"; it used to be
                          // misspelled and listed twice)
                          "qsvga" => "Larger than 400x300",
                          "vga" => "Larger than 640x480",
                          "svga" => "Larger than 800x600",
                          "xga" => "Larger than 1024x768",
                          "2mp" => "Larger than 2 MP (1600x1200)",
                          "4mp" => "Larger than 4 MP (2272x1704)",
                          "6mp" => "Larger than 6 MP (2816x2112)",
                          "8mp" => "Larger than 8 MP (3264x2448)",
                          "10mp" => "Larger than 10 MP (3648x2736)",
                          "12mp" => "Larger than 12 MP (4096x3072)",
                          "15mp" => "Larger than 15 MP (4480x3360)",
                          "20mp" => "Larger than 20 MP (5120x3840)",
                          "40mp" => "Larger than 40 MP (7216x5412)",
                          "70mp" => "Larger than 70 MP (9600x7200)"
                      ]
                  ],
                  "color" => [ // flimgcolor
                      "display" => "Color",
                      "option" => [
                          "any" => "Any color",
                          // from here, var prefix = ic:
                          "color" => "Color only",
                          "bnw" => "Black & white", // set to "gray"
                          // from here, var prefix = ic:specific,isc:
                          "red" => "Red",
                          "orange" => "Orange",
                          "yellow" => "Yellow",
                          "green" => "Green",
                          "teal" => "Teal",
                          "blue" => "Blue",
                          "purple" => "Purple",
                          "pink" => "Pink",
                          "white" => "White",
                          "gray" => "Gray",
                          "black" => "Black",
                          "brown" => "Brown"
                      ]
                  ],
                  "type" => [ // flimgtype
                      "display" => "Type",
                      "option" => [
                          "any" => "Any type",
                          "AnimatedGif" => "Animated GIF",
                          "Clipart" => "Clip Art",
                          "Line" => "Line Drawing",
                          "Photo" => "Photograph",
                          "Transparent" => "Transparent Background"
                      ]
                  ],
                  "license" => [ // flimglicense
                      "display" => "License",
                      "option" => [
                          "any" => "Any license",
                          "p" => "Public domain",
                          "s" => "Free to share",
                          "sc" => "Free to share commercially",
                          "m" => "Free to modify",
                          "mc" => "Free to modify commercially"
                      ]
                  ]
              ];

          case "videos":
              return [
                  "nsfw" => $nsfw,
                  "sort" => [
                      "display" => "Sort by",
                      "option" => [
                          "relevance" => "Most relevant",
                          "popular" => "Most popular",
                          "recent" => "Most recent"
                      ]
                  ],
                  "duration" => [ // with_duration
                      "display" => "Duration",
                      "option" => [
                          "any" => "Any duration",
                          "short" => "Short",
                          "medium" => "Medium",
                          "long" => "Long"
                      ]
                  ]
              ];

          case "news":
              return [
                  "nsfw" => $nsfw,
                  "time" => [ // with_date
                      "display" => "Time posted",
                      "option" => [
                          "any" => "Any time",
                          "d" => "Past 24 hours",
                          "w" => "Past week",
                          "m" => "Past month"
                      ]
                  ]
              ];
      }

      // sample "preferences" cookies, kept for reference:
      //preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEazerbaijaniN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:21:58 GMT; Secure; Path=/
      //preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:22:52 GMT; Secure; Path=/

      // unknown page type
      return null;
  }
  /**
   * Perform one HTTP request against Startpage and return the raw body.
   *
   * Three header sets are used: one for XHR (JSON) requests, one for
   * form POSTs and one for plain GET navigation.
   *
   * @param mixed $proxy Proxy descriptor, handed to backend::assign_proxy().
   * @param string $url Target URL. For GET requests the query string
   *                    built from $get is appended to it.
   * @param array|string $get Query parameters (GET) or request body (POST).
   *                    An array body is serialised before sending: as JSON
   *                    for XHR requests, URL-encoded otherwise, matching
   *                    the Content-Type header that is sent.
   * @param bool $post Send a POST request when true.
   * @param bool $is_xhr Use the XHR/JSON header set when true.
   * @return string Response body.
   * @throws Exception With the curl error message when the transfer fails.
   */
  private function get($proxy, $url, $get = [], $post = false, $is_xhr = false){

      // preferences cookie sent with every request
      $cookie = "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius";

      if(
          $post === true &&
          is_array($get)
      ){
          // strlen() below needs a string, and an array handed to
          // CURLOPT_POSTFIELDS would be sent as multipart/form-data,
          // contradicting the Content-Type header
          $get =
              $is_xhr === true ?
              json_encode($get) :
              http_build_query($get);
      }

      $curlproc = curl_init();

      try{

          if($post === true){

              curl_setopt($curlproc, CURLOPT_POST, true);
              curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);

          }elseif($get !== []){

              $get = http_build_query($get);
              $url .= "?" . $get;
          }

          curl_setopt($curlproc, CURLOPT_URL, $url);

          // http2 bypass
          curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
          curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

          if($is_xhr === true){

              $headers = [
                  "User-Agent: " . config::USER_AGENT,
                  "Accept: application/json",
                  "Accept-Language: en-US,en;q=0.5",
                  "Accept-Encoding: gzip",
                  "Referer: https://www.startpage.com/",
                  "Content-Type: application/json",
                  "Content-Length: " . strlen($get),
                  "Origin: https://www.startpage.com/",
                  "DNT: 1",
                  "Connection: keep-alive",
                  $cookie,
                  "Sec-Fetch-Dest: empty",
                  "Sec-Fetch-Mode: cors",
                  "Sec-Fetch-Site: same-origin",
                  "TE: trailers"
              ];

          }elseif($post === true){

              $headers = [
                  "User-Agent: " . config::USER_AGENT,
                  "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                  "Accept-Language: en-US,en;q=0.5",
                  "Accept-Encoding: gzip",
                  "Referer: https://www.startpage.com/",
                  "Content-Type: application/x-www-form-urlencoded",
                  "Content-Length: " . strlen($get),
                  "DNT: 1",
                  "Connection: keep-alive",
                  $cookie,
                  "Upgrade-Insecure-Requests: 1",
                  "Sec-Fetch-Dest: document",
                  "Sec-Fetch-Mode: navigate",
                  "Sec-Fetch-Site: none",
                  "Sec-Fetch-User: ?1",
                  "Priority: u=0, i",
                  "TE: trailers"
              ];

          }else{

              $headers = [
                  "User-Agent: " . config::USER_AGENT,
                  "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                  "Accept-Language: en-US,en;q=0.5",
                  "Accept-Encoding: gzip",
                  "DNT: 1",
                  "Connection: keep-alive",
                  $cookie,
                  "Sec-Fetch-Dest: document",
                  "Sec-Fetch-Mode: navigate",
                  "Sec-Fetch-Site: none",
                  "Sec-Fetch-User: ?1",
                  "Priority: u=0, i",
                  "TE: trailers"
              ];
          }

          curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

          curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
          curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
          curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

          $this->backend->assign_proxy($curlproc, $proxy);

          $data = curl_exec($curlproc);

          if(curl_errno($curlproc)){

              throw new Exception(curl_error($curlproc));
          }

          return $data;

      }finally{

          // release the handle on success and on failure alike
          curl_close($curlproc);
      }
  }
  /**
   * Scrape a Startpage web search page and convert it to the result array.
   *
   * The page embeds its data as a JSON argument to
   * React.createElement(UIStartpage.AppSerpWeb, ...); that object is
   * extracted and walked category by category. When "extendedsearch" is
   * "yes" on a first-page search with NSFW results allowed, a second
   * request fetches the knowledge panel (instant answer).
   *
   * @param array $get Request parameters. Keys read here: "npt" (next page
   *                   token), "s" (search term), "nsfw", "country", "time"
   *                   and "extendedsearch".
   * @return array Array with the keys "status", "spelling", "npt",
   *               "answer", "web", "image", "video", "news" and "related".
   * @throws Exception When the search term is empty, the page cannot be
   *               fetched, or the embedded JSON cannot be found or decoded.
   *               NOTE(review): detect_captcha() is defined outside this
   *               view and presumably throws as well — confirm.
   */
  public function web($get){

      if($get["npt"]){

          // continue a previous search: payload and proxy come from the backend
          [$post, $proxy] = $this->backend->get($get["npt"], "web");

          try{
              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $post,
                  true
              );
          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }

          $get_instant_answer = false;

      }else{

          // same guard as image(), video() and news()
          $search = $get["s"];
          if(strlen($search) === 0){

              throw new Exception("Search term is empty!");
          }

          $proxy = $this->backend->get_ip();

          $params = [
              "query" => $search,
              "cat" => "web",
              "pl" => "opensearch"
          ];

          if($get["nsfw"] == "no"){

              $params["qadf"] = "heavy";
              $get_instant_answer = false;
          }else{

              $get_instant_answer = true;
          }

          if($get["country"] !== "any"){

              $params["qsr"] = $get["country"];
          }

          if($get["time"] !== "any"){

              $params["with_date"] = $get["time"];
          }

          try{
              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $params
              );
          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }
      }

      $this->detect_captcha($html);

      // preg_match() returns false on a PCRE error; only 1 means "found"
      if(
          preg_match(
              '/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),?$/m',
              $html,
              $matches
          ) !== 1
      ){

          throw new Exception("Failed to grep JSON object");
      }

      $json = json_decode($matches[1], true);

      if($json === null){

          throw new Exception("Failed to decode JSON");
      }

      $out = [
          "status" => "ok",
          "spelling" => [
              "type" => "no_correction",
              "using" => null,
              "correction" => null
          ],
          "npt" => null,
          "answer" => [],
          "web" => [],
          "image" => [],
          "video" => [],
          "news" => [],
          "related" => []
      ];

      // get npt
      $out["npt"] = $this->parse_npt($json, "web", $proxy);

      foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){

          if(!isset($category["display_type"])){

              continue;
          }

          switch($category["display_type"]){

              case "web-google":
                  foreach($category["results"] as $result){

                      // site links are optional
                      $sublinks = [];
                      foreach(($result["siteLinks"] ?? []) as $sublink){

                          $sublinks[] = [
                              "title" => $sublink["title"],
                              "description" => null,
                              "url" => $sublink["clickUrl"]
                          ];
                      }

                      // a description may start with "<date> ... <text>";
                      // split on the first "..." and test the left part
                      $description =
                          explode(
                              "...",
                              $this->titledots(
                                  html_entity_decode(
                                      $this->fuckhtml
                                      ->getTextContent(
                                          $result["description"]
                                      )
                                  )
                              ),
                              2
                          );

                      $date = strtotime(trim($description[0]));

                      if(
                          $date === false ||
                          count($description) !== 2 ||
                          strlen($description[0]) > 14
                      ){

                          // no date found: restore the full description
                          $description =
                              implode(
                                  " ... ",
                                  $description
                              );
                          $date = null;
                      }else{

                          // date found: keep only the text after it
                          $description = ltrim($description[1]);
                      }

                      $out["web"][] = [
                          "title" =>
                              $this->titledots(
                                  html_entity_decode(
                                      $this->fuckhtml
                                      ->getTextContent(
                                          $result["title"]
                                      )
                                  )
                              ),
                          "description" => $description,
                          "url" => $result["clickUrl"],
                          "date" => $date,
                          "type" => "web",
                          "thumb" => [
                              "url" => null,
                              "ratio" => null
                          ],
                          "sublink" => $sublinks,
                          "table" => []
                      ];
                  }
                  break;

              case "images-qi-top":
                  foreach($category["results"] as $result){

                      $out["image"][] = [
                          "title" =>
                              $this->titledots(
                                  html_entity_decode(
                                      $this->fuckhtml
                                      ->getTextContent(
                                          $result["title"]
                                      )
                                  )
                              ),
                          "source" => [
                              [
                                  "url" => $result["rawImageUrl"],
                                  "width" => (int)$result["width"],
                                  "height" => (int)$result["height"]
                              ],
                              [
                                  "url" => $this->unshitimage($result["mdThumbnailUrl"]),
                                  "width" => (int)$result["mdThumbnailWidth"],
                                  "height" => (int)$result["mdThumbnailHeight"]
                              ]
                          ],
                          "url" => $result["altClickUrl"]
                      ];
                  }
                  break;

              case "spellsuggest-google":
                  $out["spelling"] = [
                      "type" => "including",
                      "using" => $json["render"]["query"],
                      "correction" => $category["results"][0]["query"]
                  ];
                  break;

              case "dictionary-qi":
                  foreach($category["results"] as $result){

                      $answer = [
                          "title" => $result["word"],
                          "description" => [],
                          "url" => null,
                          "thumb" => null,
                          "table" => [],
                          "sublink" => []
                      ];

                      foreach($result["lexical_categories"] as $lexic_type => $definitions){

                          $answer["description"][] = [
                              "type" => "title",
                              "value" => $lexic_type
                          ];

                          // definitions are numbered per lexical category
                          $i = 0;
                          foreach($definitions as $definition){

                              // any of these fields may be absent
                              $text_definition = trim($definition["definition"] ?? "");
                              $text_example = trim($definition["example"] ?? "");
                              $text_synonyms = implode(", ", $definition["synonyms"] ?? []);

                              if($text_definition != ""){

                                  $i++;
                                  $c = count($answer["description"]) - 1;

                                  if(
                                      $c !== 0 &&
                                      $answer["description"][$c]["type"] == "text"
                                  ){

                                      // extend the preceding text node
                                      $answer["description"][$c]["value"] .=
                                          "\n\n" . $i . ". " . $text_definition;
                                  }else{

                                      $answer["description"][] = [
                                          "type" => "text",
                                          "value" => $i . ". " . $text_definition
                                      ];
                                  }
                              }

                              if($text_example != ""){

                                  $answer["description"][] = [
                                      "type" => "quote",
                                      "value" => $text_example
                                  ];
                              }

                              if($text_synonyms != ""){

                                  $answer["description"][] = [
                                      "type" => "text",
                                      "value" => "Synonyms: " . $text_synonyms
                                  ];
                              }
                          }
                      }

                      $out["answer"][] = $answer;
                  }
                  break;
          }
      }

      // parse instant answers
      if(
          $get["extendedsearch"] == "yes" &&
          $get_instant_answer === true
      ){

          // https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1
          // best effort: any failure in here only drops the instant answer
          try{

              $post = [
                  "se" => "n0vze2y9dqwy",
                  "q" => $json["render"]["query"],
                  "results" => [], // populated below
                  "enableKnowledgePanel" => true,
                  "enableMediaThumbBar" => false,
                  "enableSearchSuggestions" => false,
                  "enableTripadvisorProperties" => [],
                  "enableTripadvisorPlaces" => [],
                  "enableTripadvisorPlacesForLocations" => [],
                  "enableWebProducts" => false,
                  "tripadvisorPartnerId" => null,
                  "tripadvisorMapColorMode" => "light",
                  "tripadvisorDisablesKnowledgePanel" => false,
                  "instantAnswers" => [
                      "smartAnswers",
                      "youtube",
                      "tripadvisor"
                  ],
                  "iaType" => null,
                  "forceEnhancedKnowledgePanel" => false,
                  "shoppingOnly" => false,
                  "allowAdultProducts" => true,
                  "lang" => "en",
                  "browserLang" => "en-US",
                  "browserTimezone" => "America/New_York",
                  "market" => null,
                  "userLocation" => null,
                  "userDate" => date("Y-m-d"),
                  "userAgentType" => "unknown"
              ];

              foreach($out["web"] as $result){

                  $post["results"][] = [
                      "url" => $result["url"],
                      "title" => $result["title"]
                  ];
              }

              $post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE);

              $additional_data =
                  $this->get(
                      $proxy,
                      "https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1",
                      $post,
                      true,
                      true
                  );

              $additional_data = json_decode($additional_data, true);

              if($additional_data === null){

                  throw new Exception("Failed to decode JSON"); // just break out, dont fail completely
              }

              if(!isset($additional_data["knowledgePanel"])){

                  throw new Exception("Response has missing data (knowledgePanel)");
              }

              $additional_data = $additional_data["knowledgePanel"];

              $answer = [
                  "title" => $additional_data["meta"]["title"],
                  "description" => [
                      [
                          "type" => "quote",
                          "value" => $additional_data["meta"]["description"]
                      ]
                  ],
                  "url" => $additional_data["meta"]["origWikiUrl"],
                  "thumb" => $additional_data["meta"]["image"],
                  "table" => [],
                  "sublink" => []
              ];

              // parse html for instant answer
              $this->fuckhtml->load($additional_data["html"]);

              $div =
                  $this->fuckhtml
                  ->getElementsByTagName(
                      "div"
                  );

              // get description
              $description =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "sx-kp-short-extract sx-kp-short-extract-complete",
                      $div
                  );

              if(count($description) !== 0){

                  $answer["description"][] = [
                      "type" => "text",
                      "value" =>
                          html_entity_decode(
                              $this->fuckhtml
                              ->getTextContent(
                                  $description[0]
                              )
                          )
                  ];
              }

              // get socials
              $socials =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "sx-wiki-social-link",
                      "a"
                  );

              foreach($socials as $social){

                  $title =
                      $this->fuckhtml
                      ->getTextContent(
                          $social["attributes"]["title"]
                      );

                  $url =
                      $this->fuckhtml
                      ->getTextContent(
                          $social["attributes"]["href"]
                      );

                  if($title === "Official Website"){

                      $title = "Website";
                  }

                  $answer["sublink"][$title] = $url;
              }

              // get videos
              $videos =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "sx-kp-video-grid-item",
                      $div
                  );

              foreach($videos as $video){

                  // narrow the parser to this grid item
                  $this->fuckhtml->load($video);

                  $as =
                      $this->fuckhtml
                      ->getElementsByTagName(
                          "a"
                      );

                  if(count($as) === 0){

                      // no link, nothing usable
                      continue;
                  }

                  $image =
                      $this->fuckhtml
                      ->getElementsByAttributeName(
                          "data-sx-src",
                          "img"
                      );

                  if(count($image) !== 0){

                      $thumb = [
                          "ratio" => "16:9",
                          "url" =>
                              $this->fuckhtml
                              ->getTextContent(
                                  $image[0]["attributes"]["data-sx-src"]
                              )
                      ];
                  }else{

                      $thumb = [
                          "ratio" => null,
                          "url" => null
                      ];
                  }

                  $out["video"][] = [
                      "title" =>
                          $this->fuckhtml
                          ->getTextContent(
                              $as[0]["attributes"]["title"]
                          ),
                      "description" => null,
                      "date" => null,
                      "duration" => null,
                      "views" => null,
                      "thumb" => $thumb,
                      "url" =>
                          $this->fuckhtml
                          ->getTextContent(
                              $as[0]["attributes"]["href"]
                          )
                  ];
              }

              // reset the parser to the whole knowledge panel
              $this->fuckhtml->load($additional_data["html"]);

              // get table elements
              $table =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "sx-infobox",
                      "table"
                  );

              if(count($table) !== 0){

                  $trs =
                      $this->fuckhtml
                      ->getElementsByTagName(
                          "tr"
                      );

                  foreach($trs as $tr){

                      $this->fuckhtml->load($tr);

                      // in this table, th holds the row title and
                      // td holds the row content
                      $tds =
                          $this->fuckhtml
                          ->getElementsByTagName(
                              "td"
                          );

                      $ths =
                          $this->fuckhtml
                          ->getElementsByTagName(
                              "th"
                          );

                      if(
                          count($ths) !== 1 ||
                          count($tds) !== 1
                      ){

                          // only plain title/content rows are used
                          continue;
                      }

                      $title =
                          $this->fuckhtml
                          ->getTextContent(
                              $ths[0]
                          );

                      $this->fuckhtml->load($tds[0]);

                      $lis =
                          $this->fuckhtml
                          ->getElementsByTagName(
                              "li"
                          );

                      if(count($lis) !== 0){

                          // list content is flattened to "a, b, c"
                          $description = [];
                          foreach($lis as $li){

                              $description[] =
                                  $this->fuckhtml
                                  ->getTextContent(
                                      $li
                                  );
                          }

                          $description = implode(", ", $description);
                      }else{

                          $description =
                              $this->fuckhtml
                              ->getTextContent(
                                  $tds[0]
                              );
                      }

                      $answer["table"][$title] = $description;
                  }
              }

              $out["answer"][] = $answer;

          }catch(Exception $error){

              // deliberately ignored: the regular results are still returned
          }
      }

      return $out;
  }
  /**
   * Scrape a Startpage image search page and convert it to the result array.
   *
   * The page embeds its data as a JSON argument to
   * React.createElement(UIStartpage.AppSerpImages, ...). Only categories
   * with the display type "images-bing" are read.
   *
   * @param array $get Request parameters. Keys read here: "npt" (next page
   *                   token), "s" (search term), "nsfw", "size", "color",
   *                   "type" and "license".
   * @return array Array with the keys "status", "npt" and "image".
   * @throws Exception When the search term is empty, the page cannot be
   *               fetched, or the embedded JSON cannot be found or decoded.
   *               NOTE(review): detect_captcha() is defined outside this
   *               view and presumably throws as well — confirm.
   */
  public function image($get){

      if($get["npt"]){

          // continue a previous search: payload and proxy come from the backend
          [$post, $proxy] = $this->backend->get($get["npt"], "images");

          try{
              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $post,
                  true
              );
          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }

      }else{

          $search = $get["s"];
          if(strlen($search) === 0){

              throw new Exception("Search term is empty!");
          }

          // one try block is enough: both nested handlers raised the same error
          try{

              $proxy = $this->backend->get_ip();

              $params = [
                  "query" => $search,
                  "cat" => "images",
                  "pl" => "opensearch"
              ];

              if($get["nsfw"] == "no"){

                  $params["qadf"] = "heavy";
              }

              if($get["size"] != "any"){

                  if(
                      in_array(
                          $get["size"],
                          ["Small", "Medium", "Large", "Wallpaper"],
                          true
                      )
                  ){

                      // named size class
                      $params["flimgsize"] = $get["size"];
                  }else{

                      // "larger than" minimum size
                      $params["image-size-select"] = "isz:lt,islt:" . $get["size"];
                  }
              }

              if($get["color"] != "any"){

                  if($get["color"] == "color"){

                      $params["flimgcolor"] = "ic:color";
                  }elseif($get["color"] == "bnw"){

                      $params["flimgcolor"] = "ic:gray";
                  }else{

                      $params["flimgcolor"] = "ic:specific,isc:" . $get["color"];
                  }
              }

              if($get["type"] != "any"){

                  $params["flimgtype"] = $get["type"];
              }

              if($get["license"] != "any"){

                  $params["flimglicense"] = $get["license"];
              }

              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $params
              );

          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }
      }

      $this->detect_captcha($html);

      $out = [
          "status" => "ok",
          "npt" => null,
          "image" => []
      ];

      // preg_match() returns false on a PCRE error; only 1 means "found"
      if(
          preg_match(
              '/React\.createElement\(UIStartpage\.AppSerpImages, ?(.+)\),?$/m',
              $html,
              $matches
          ) !== 1
      ){

          throw new Exception("Failed to grep JSON object");
      }

      $json = json_decode($matches[1], true);

      if($json === null){

          throw new Exception("Failed to decode JSON object");
      }

      // get npt
      $out["npt"] = $this->parse_npt($json, "images", $proxy);

      // get images
      foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){

          if(
              !isset($category["display_type"]) ||
              $category["display_type"] != "images-bing"
          ){

              // ignore ads and suggestions (@todo: parse suggestions)
              continue;
          }

          foreach($category["results"] as $image){

              $out["image"][] = [
                  "title" => $this->titledots($image["title"]),
                  "source" => [
                      [
                          "url" => $this->unshitimage($image["clickUrl"]),
                          "width" => (int)$image["width"],
                          "height" => (int)$image["height"]
                      ],
                      [
                          "url" => $this->unshitimage($image["thumbnailUrl"]),
                          "width" => (int)$image["thumbnailWidth"],
                          "height" => (int)$image["thumbnailHeight"]
                      ]
                  ],
                  "url" => $image["altClickUrl"]
              ];
          }
      }

      return $out;
  }
  /**
   * Scrape a Startpage video search page and convert it to the result array.
   *
   * The page embeds its data as a JSON argument to
   * React.createElement(UIStartpage.AppSerpVideos, ...). Only categories
   * with the display type "video-youtube" are read.
   *
   * @param array $get Request parameters. Keys read here: "npt" (next page
   *                   token), "s" (search term), "nsfw", "sort" and
   *                   "duration".
   * @return array Array with the keys "status", "npt", "video", "author",
   *               "livestream", "playlist" and "reel"; only "video" is
   *               filled here.
   * @throws Exception When the search term is empty, the page cannot be
   *               fetched, or the embedded JSON cannot be found or decoded.
   *               NOTE(review): detect_captcha() is defined outside this
   *               view and presumably throws as well — confirm.
   */
  public function video($get){

      if($get["npt"]){

          // continue a previous search: payload and proxy come from the backend
          [$post, $proxy] = $this->backend->get($get["npt"], "videos");

          try{
              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $post,
                  true
              );
          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }

      }else{

          $search = $get["s"];
          if(strlen($search) === 0){

              throw new Exception("Search term is empty!");
          }

          // one try block is enough: both nested handlers raised the same error
          try{

              $proxy = $this->backend->get_ip();

              $params = [
                  "query" => $search,
                  "cat" => "video",
                  "pl" => "opensearch"
              ];

              if($get["nsfw"] == "no"){

                  $params["qadf"] = "heavy";
              }

              if($get["sort"] != "relevance"){

                  $params["sort_by"] = $get["sort"];
              }

              if($get["duration"] != "any"){

                  $params["with_duration"] = $get["duration"];
              }

              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $params
              );

          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }
      }

      $this->detect_captcha($html);

      // preg_match() returns false on a PCRE error; only 1 means "found"
      if(
          preg_match(
              '/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),?$/m',
              $html,
              $matches
          ) !== 1
      ){

          throw new Exception("Failed to get JSON object");
      }

      $json = json_decode($matches[1], true);

      if($json === null){

          throw new Exception("Failed to decode JSON object");
      }

      $out = [
          "status" => "ok",
          "npt" => null,
          "video" => [],
          "author" => [],
          "livestream" => [],
          "playlist" => [],
          "reel" => []
      ];

      // get npt
      $out["npt"] = $this->parse_npt($json, "video", $proxy);

      // get results
      foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){

          if(
              !isset($category["display_type"]) ||
              $category["display_type"] != "video-youtube"
          ){

              // unsupported category
              continue;
          }

          foreach($category["results"] as $video){

              // isset() is false for null values too
              if(isset($video["thumbnailUrl"])){

                  $thumb = [
                      "ratio" => "16:9",
                      "url" => $this->unshitimage($video["thumbnailUrl"])
                  ];
              }else{

                  $thumb = [
                      "ratio" => null,
                      "url" => null
                  ];
              }

              $out["video"][] = [
                  "title" => $video["title"],
                  "description" => $this->limitstrlen($video["description"]),
                  "author" => [
                      "name" => $video["channelTitle"],
                      "url" => null,
                      "avatar" => null
                  ],
                  "date" => strtotime($video["publishDate"]),
                  "duration" => $this->hms2int($video["duration"]),
                  "views" => (int)$video["viewCount"],
                  "thumb" => $thumb,
                  "url" => $video["clickUrl"]
              ];
          }
      }

      return $out;
  }
  /**
   * Scrape a Startpage news search page and convert it to the result array.
   *
   * The page embeds its data as a JSON argument to
   * React.createElement(UIStartpage.AppSerpNews, ...). Only categories
   * with the display type "news-bing" are read.
   *
   * @param array $get Request parameters. Keys read here: "npt" (next page
   *                   token), "s" (search term), "nsfw" and "time".
   * @return array Array with the keys "status", "npt" and "news".
   * @throws Exception When the search term is empty, the page cannot be
   *               fetched, or the embedded JSON cannot be found or decoded.
   *               NOTE(review): detect_captcha() is defined outside this
   *               view and presumably throws as well — confirm.
   */
  public function news($get){

      if($get["npt"]){

          // continue a previous search: payload and proxy come from the backend
          [$post, $proxy] = $this->backend->get($get["npt"], "news");

          try{
              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $post,
                  true
              );
          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }

      }else{

          $search = $get["s"];
          if(strlen($search) === 0){

              throw new Exception("Search term is empty!");
          }

          // one try block is enough: both nested handlers raised the same error
          try{

              $proxy = $this->backend->get_ip();

              $params = [
                  "query" => $search,
                  "cat" => "news",
                  "pl" => "opensearch"
              ];

              if($get["nsfw"] == "no"){

                  $params["qadf"] = "heavy";
              }

              if($get["time"] != "any"){

                  $params["with_date"] = $get["time"];
              }

              $html = $this->get(
                  $proxy,
                  "https://www.startpage.com/sp/search",
                  $params
              );

          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }
      }

      $this->detect_captcha($html);

      // preg_match() returns false on a PCRE error; only 1 means "found"
      if(
          preg_match(
              '/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),?$/m',
              $html,
              $matches
          ) !== 1
      ){

          throw new Exception("Failed to get JSON object");
      }

      $json = json_decode($matches[1], true);

      if($json === null){

          throw new Exception("Failed to decode JSON object");
      }

      $out = [
          "status" => "ok",
          "npt" => null,
          "news" => []
      ];

      // get npt
      $out["npt"] = $this->parse_npt($json, "news", $proxy);

      foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){

          if(
              !isset($category["display_type"]) ||
              $category["display_type"] != "news-bing"
          ){

              // unsupported category
              continue;
          }

          foreach($category["results"] as $news){

              // isset() is false for null values too
              if(isset($news["thumbnailUrl"])){

                  $thumb = [
                      "ratio" => "16:9",
                      "url" => $this->unshitimage($news["thumbnailUrl"])
                  ];
              }else{

                  $thumb = [
                      "ratio" => null,
                      "url" => null
                  ];
              }

              $out["news"][] = [
                  "title" => $this->titledots($this->remove_penguins($news["title"])),
                  "author" => $news["source"],
                  "description" => $this->titledots($this->remove_penguins($news["description"])),
                  // the last three digits are cut off the upstream value
                  // (presumably milliseconds -> seconds; confirm)
                  "date" => (int)substr((string)$news["date"], 0, -3),
                  "thumb" => $thumb,
                  "url" => $news["clickUrl"]
              ];
          }
      }

      return $out;
  }
  /**
   * Build and store the request for the next results page.
   *
   * Looks for the pagination entry named "Next", pulls "q", "sc" and "page"
   * out of its URL's query string and hands the rebuilt query to
   * backend->store().
   *
   * @param array $json Decoded page state.
   * @param string $pagetype Search category, sent as "cat" and passed to store().
   * @param mixed $proxy Passed through to store() unchanged.
   * @return mixed Whatever backend->store() returns (presumably the next-page
   *               token), or null when there is no usable "Next" entry.
   */
  private function parse_npt($json, $pagetype, $proxy){
    $pages = $json["render"]["presenter"]["pagination"]["pages"] ?? [];
    if(!is_array($pages)){
      return null;
    }
    foreach($pages as $page){
      if(
        !is_array($page) ||
        ($page["name"] ?? null) !== "Next"
      ){
        continue;
      }
      $url_parts = explode("?", (string)($page["url"] ?? ""), 2);
      if(!isset($url_parts[1])){
        // "Next" link has no query string, nothing to build a request from
        return null;
      }
      parse_str($url_parts[1], $str);
      return
        $this->backend->store(
          http_build_query(
            [
              "lui" => "english",
              "language" => "english",
              "query" => $str["q"] ?? null,
              "cat" => $pagetype,
              "sc" => $str["sc"] ?? null,
              "t" => "device",
              "segment" => "startpage.udog",
              "page" => $str["page"] ?? null
            ]
          ),
          $pagetype,
          $proxy
        );
    }
    return null;
  }
  /**
   * Extract the real image URL from a proxied thumbnail URL.
   *
   * If the URL's query string has a "piurl" parameter, that value is returned.
   * For values containing gstatic.com/, bing.net/ or bing.com/, everything
   * from the first "&" onwards is dropped. URLs without a usable "piurl" are
   * returned unchanged.
   *
   * @param string $url Thumbnail URL as found in the page state.
   * @return string Unwrapped image URL, or $url itself.
   */
  private function unshitimage($url){
    $query_string = parse_url($url, PHP_URL_QUERY);
    if(
      !is_string($query_string) ||
      $query_string === ""
    ){
      // no query string (or unparsable URL): nothing to unwrap
      return $url;
    }
    parse_str($query_string, $query);
    if(
      !isset($query["piurl"]) ||
      !is_string($query["piurl"])
    ){
      return $url;
    }
    $image = $query["piurl"];
    foreach(["gstatic.com/", "bing.net/", "bing.com/"] as $host){
      // compare against false: a match at offset 0 is still a match
      if(strpos($image, $host) !== false){
        return explode("&", $image, 2)[0];
      }
    }
    return $image;
  }
  /**
   * Flatten $text to a single line and keep only the first chunk that
   * wordwrap() produces at a width of 300.
   *
   * @param string $text
   * @return string
   */
  private function limitstrlen($text){
    // line breaks become spaces so that only wordwrap() introduces "\n"
    $single_line = str_replace(["\n\r", "\r\n", "\n", "\r"], " ", $text);
    $wrapped = wordwrap($single_line, 300, "\n");
    $first_break = strpos($wrapped, "\n");
    if($first_break === false){
      return $wrapped;
    }
    return substr($wrapped, 0, $first_break);
  }
  /**
   * Strip leading/trailing whitespace, dots and ellipsis characters.
   *
   * trim() treats its character list as single bytes, so listing the
   * multibyte ellipsis there also strips stray bytes from neighbouring UTF-8
   * characters. The trimming is therefore done with a Unicode-aware pattern.
   *
   * @param string $title
   * @return string
   */
  private function titledots($title){
    $title = (string)$title;
    $trimmed =
      preg_replace(
        '/^[ .\t\n\r\x00\x0B\x{2026}]+|[ .\t\n\r\x00\x0B\x{2026}]+$/Du',
        "",
        $title
      );
    if($trimmed === null){
      // input is not valid UTF-8: fall back to an ASCII-only trim
      return trim($title, " .\t\n\r\0\x0B");
    }
    return $trimmed;
  }
  /**
   * Convert a "H:M:S", "M:S" or "S" duration string to seconds.
   *
   * The string is split into at most three colon-separated fields; anything
   * after a third colon stays attached to the last field and is handled by
   * the integer cast.
   *
   * @param string $time
   * @return int
   */
  private function hms2int($time){
    $seconds = 0;
    // each field is worth 60x the one to its right
    foreach(explode(":", $time, 3) as $field){
      $seconds = ($seconds * 60) + (int)$field;
    }
    return $seconds;
  }
  /**
   * Remove the private-use marker characters U+E000 and U+E001 from $text.
   *
   * NOTE(review): the two search literals were invisible/empty in the source;
   * they are presumably the highlight markers Bing wraps around matched
   * terms. They are written as explicit escapes so they cannot be lost to
   * editors or copy/paste -- confirm against a live response.
   *
   * @param string $text
   * @return string
   */
  private function remove_penguins($text){
    return str_replace(
      ["\u{E000}", "\u{E001}"],
      "",
      $text
    );
  }
  /**
   * Abort when Startpage answered with a redirect page instead of results.
   *
   * A page whose first <title> is "Redirecting..." is treated as a redirect.
   * If any of its links has text containing the captcha URL, a captcha
   * exception is thrown; otherwise a generic redirect exception is thrown.
   * Any other page passes through silently.
   *
   * @param string $html Raw response body.
   * @throws Exception On a captcha or any other redirect page.
   */
  private function detect_captcha($html){
    $this->fuckhtml->load($html);
    $titles = $this->fuckhtml->getElementsByTagName("title");
    if(
      count($titles) === 0 ||
      $titles[0]["innerHTML"] != "Redirecting..."
    ){
      // regular page, nothing to report
      return;
    }
    // check if it's a captcha
    $links = $this->fuckhtml->getElementsByTagName("a");
    foreach($links as $link){
      $link_text = $this->fuckhtml->getTextContent($link["innerHTML"]);
      if(strpos($link_text, "https://www.startpage.com/sp/captcha") !== false){
        throw new Exception("Startpage returned a captcha");
      }
    }
    throw new Exception("Startpage redirected the scraper to an unhandled page");
  }
  1270. }