google_cse.php 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054
  1. <?php
  /**
   * Scraper for the Google Programmable Search Engine (CSE) hosted page.
   *
   * Flow, as implemented below:
   *  1. generate_token() downloads the hosted CSE page and its bootstrap script
   *     to obtain a search token and the CSE library version.
   *  2. web() / image() call the JSONP endpoint
   *     https://cse.google.com/cse/element/v1 with that token.
   *  3. The request parameters are stored through the backend helper so that
   *     the next page can be requested later through a "next page token" (npt).
   *
   * Relies on the project classes `backend`, `fuckhtml` and `config`, which are
   * defined outside this file.
   */
  class google_cse{

    // request flavours understood by get(): they select the header set
    public const req_html = 0;
    public const req_js = 1;

    // Declared explicitly (dynamic properties are deprecated since PHP 8.2).
    // Kept public because they used to be created dynamically, i.e. public.

    /** @var backend proxy assignment + next page token storage (lib/backend.php) */
    public $backend;

    /** @var fuckhtml HTML/JS string parsing helper (lib/fuckhtml.php) */
    public $fuckhtml;

    /**
     * Loads the helper libraries and instantiates them.
     *
     * include_once is used so that creating a second scraper in the same
     * request does not try to declare the helper classes twice.
     */
    public function __construct(){

      include_once "lib/backend.php";
      $this->backend = new backend("google_cse");

      include_once "lib/fuckhtml.php";
      $this->fuckhtml = new fuckhtml();
    }

    /**
     * Returns the filters available for a search page.
     *
     * Each filter maps to ["display" => label, "option" => [value => label]].
     * The "spellcheck" filter intentionally has no "display" entry.
     *
     * @param string $page "web" or "images"
     * @return array|null filter definitions, or null for an unknown page
     */
    public function getfilters($page){

      $base = [
        "country" => [ // sent as cr=country<CODE> (uppercased) by web() and image()
          "display" => "Country",
          "option" => [
            "any" => "Any country",
            "af" => "Afghanistan",
            "al" => "Albania",
            "dz" => "Algeria",
            "as" => "American Samoa",
            "ad" => "Andorra",
            "ao" => "Angola",
            "ai" => "Anguilla",
            "aq" => "Antarctica",
            "ag" => "Antigua and Barbuda",
            "ar" => "Argentina",
            "am" => "Armenia",
            "aw" => "Aruba",
            "au" => "Australia",
            "at" => "Austria",
            "az" => "Azerbaijan",
            "bs" => "Bahamas",
            "bh" => "Bahrain",
            "bd" => "Bangladesh",
            "bb" => "Barbados",
            "by" => "Belarus",
            "be" => "Belgium",
            "bz" => "Belize",
            "bj" => "Benin",
            "bm" => "Bermuda",
            "bt" => "Bhutan",
            "bo" => "Bolivia",
            "ba" => "Bosnia and Herzegovina",
            "bw" => "Botswana",
            "bv" => "Bouvet Island",
            "br" => "Brazil",
            "io" => "British Indian Ocean Territory",
            "bn" => "Brunei Darussalam",
            "bg" => "Bulgaria",
            "bf" => "Burkina Faso",
            "bi" => "Burundi",
            "kh" => "Cambodia",
            "cm" => "Cameroon",
            "ca" => "Canada",
            "cv" => "Cape Verde",
            "ky" => "Cayman Islands",
            "cf" => "Central African Republic",
            "td" => "Chad",
            "cl" => "Chile",
            "cn" => "China",
            "cx" => "Christmas Island",
            "cc" => "Cocos (Keeling) Islands",
            "co" => "Colombia",
            "km" => "Comoros",
            "cg" => "Congo",
            "cd" => "Congo, the Democratic Republic",
            "ck" => "Cook Islands",
            "cr" => "Costa Rica",
            "ci" => "Cote D'ivoire",
            "hr" => "Croatia",
            "cu" => "Cuba",
            "cy" => "Cyprus",
            "cz" => "Czech Republic",
            "dk" => "Denmark",
            "dj" => "Djibouti",
            "dm" => "Dominica",
            "do" => "Dominican Republic",
            "ec" => "Ecuador",
            "eg" => "Egypt",
            "sv" => "El Salvador",
            "gq" => "Equatorial Guinea",
            "er" => "Eritrea",
            "ee" => "Estonia",
            "et" => "Ethiopia",
            "fk" => "Falkland Islands (Malvinas)",
            "fo" => "Faroe Islands",
            "fj" => "Fiji",
            "fi" => "Finland",
            "fr" => "France",
            "gf" => "French Guiana",
            "pf" => "French Polynesia",
            "tf" => "French Southern Territories",
            "ga" => "Gabon",
            "gm" => "Gambia",
            "ge" => "Georgia",
            "de" => "Germany",
            "gh" => "Ghana",
            "gi" => "Gibraltar",
            "gr" => "Greece",
            "gl" => "Greenland",
            "gd" => "Grenada",
            "gp" => "Guadeloupe",
            "gu" => "Guam",
            "gt" => "Guatemala",
            "gn" => "Guinea",
            "gw" => "Guinea-Bissau",
            "gy" => "Guyana",
            "ht" => "Haiti",
            "hm" => "Heard Island and Mcdonald Islands",
            "va" => "Holy See (Vatican City State)",
            "hn" => "Honduras",
            "hk" => "Hong Kong",
            "hu" => "Hungary",
            "is" => "Iceland",
            "in" => "India",
            "id" => "Indonesia",
            "ir" => "Iran, Islamic Republic",
            "iq" => "Iraq",
            "ie" => "Ireland",
            "il" => "Israel",
            "it" => "Italy",
            "jm" => "Jamaica",
            "jp" => "Japan",
            "jo" => "Jordan",
            "kz" => "Kazakhstan",
            "ke" => "Kenya",
            "ki" => "Kiribati",
            "kp" => "Korea, Democratic People's Republic",
            "kr" => "Korea, Republic",
            "kw" => "Kuwait",
            "kg" => "Kyrgyzstan",
            "la" => "Lao People's Democratic Republic",
            "lv" => "Latvia",
            "lb" => "Lebanon",
            "ls" => "Lesotho",
            "lr" => "Liberia",
            "ly" => "Libyan Arab Jamahiriya",
            "li" => "Liechtenstein",
            "lt" => "Lithuania",
            "lu" => "Luxembourg",
            "mo" => "Macao",
            "mk" => "Macedonia, the Former Yugoslav Republic",
            "mg" => "Madagascar",
            "mw" => "Malawi",
            "my" => "Malaysia",
            "mv" => "Maldives",
            "ml" => "Mali",
            "mt" => "Malta",
            "mh" => "Marshall Islands",
            "mq" => "Martinique",
            "mr" => "Mauritania",
            "mu" => "Mauritius",
            "yt" => "Mayotte",
            "mx" => "Mexico",
            "fm" => "Micronesia, Federated States",
            "md" => "Moldova, Republic",
            "mc" => "Monaco",
            "mn" => "Mongolia",
            "ms" => "Montserrat",
            "ma" => "Morocco",
            "mz" => "Mozambique",
            "mm" => "Myanmar",
            "na" => "Namibia",
            "nr" => "Nauru",
            "np" => "Nepal",
            "nl" => "Netherlands",
            "an" => "Netherlands Antilles",
            "nc" => "New Caledonia",
            "nz" => "New Zealand",
            "ni" => "Nicaragua",
            "ne" => "Niger",
            "ng" => "Nigeria",
            "nu" => "Niue",
            "nf" => "Norfolk Island",
            "mp" => "Northern Mariana Islands",
            "no" => "Norway",
            "om" => "Oman",
            "pk" => "Pakistan",
            "pw" => "Palau",
            "ps" => "Palestinian Territory, Occupied",
            "pa" => "Panama",
            "pg" => "Papua New Guinea",
            "py" => "Paraguay",
            "pe" => "Peru",
            "ph" => "Philippines",
            "pn" => "Pitcairn",
            "pl" => "Poland",
            "pt" => "Portugal",
            "pr" => "Puerto Rico",
            "qa" => "Qatar",
            "re" => "Reunion",
            "ro" => "Romania",
            "ru" => "Russian Federation",
            "rw" => "Rwanda",
            "sh" => "Saint Helena",
            "kn" => "Saint Kitts and Nevis",
            "lc" => "Saint Lucia",
            "pm" => "Saint Pierre and Miquelon",
            "vc" => "Saint Vincent and the Grenadines",
            "ws" => "Samoa",
            "sm" => "San Marino",
            "st" => "Sao Tome and Principe",
            "sa" => "Saudi Arabia",
            "sn" => "Senegal",
            "cs" => "Serbia and Montenegro",
            "sc" => "Seychelles",
            "sl" => "Sierra Leone",
            "sg" => "Singapore",
            "sk" => "Slovakia",
            "si" => "Slovenia",
            "sb" => "Solomon Islands",
            "so" => "Somalia",
            "za" => "South Africa",
            "gs" => "South Georgia and the South Sandwich Islands",
            "es" => "Spain",
            "lk" => "Sri Lanka",
            "sd" => "Sudan",
            "sr" => "Suriname",
            "sj" => "Svalbard and Jan Mayen",
            "sz" => "Swaziland",
            "se" => "Sweden",
            "ch" => "Switzerland",
            "sy" => "Syrian Arab Republic",
            "tw" => "Taiwan, Province of China",
            "tj" => "Tajikistan",
            "tz" => "Tanzania, United Republic",
            "th" => "Thailand",
            "tl" => "Timor-Leste",
            "tg" => "Togo",
            "tk" => "Tokelau",
            "to" => "Tonga",
            "tt" => "Trinidad and Tobago",
            "tn" => "Tunisia",
            "tr" => "Turkey",
            "tm" => "Turkmenistan",
            "tc" => "Turks and Caicos Islands",
            "tv" => "Tuvalu",
            "ug" => "Uganda",
            "ua" => "Ukraine",
            "ae" => "United Arab Emirates",
            "uk" => "United Kingdom",
            "us" => "United States",
            "um" => "United States Minor Outlying Islands",
            "uy" => "Uruguay",
            "uz" => "Uzbekistan",
            "vu" => "Vanuatu",
            "ve" => "Venezuela",
            "vn" => "Viet Nam",
            "vg" => "Virgin Islands, British",
            "vi" => "Virgin Islands, U.S.",
            "wf" => "Wallis and Futuna",
            "eh" => "Western Sahara",
            "ye" => "Yemen",
            "zm" => "Zambia",
            "zw" => "Zimbabwe"
          ]
        ],
        "nsfw" => [
          "display" => "NSFW",
          "option" => [
            "yes" => "Yes", // safe=off
            "no" => "No" // safe=active
          ]
        ],
        "spellcheck" => [
          // display undefined
          "option" => [
            "yes" => "Yes",
            "no" => "No" // adds nfpr=1 to the web request
          ]
        ]
      ];

      switch($page){

        case "web":
          return array_merge(
            $base,
            [
              "lang" => [ // lr=<lang> (prefix lang with "lang_")
                "display" => "Language",
                "option" => [
                  "any" => "Any language",
                  "ar" => "Arabic",
                  "bg" => "Bulgarian",
                  "ca" => "Catalan",
                  "cs" => "Czech",
                  "da" => "Danish",
                  "de" => "German",
                  "el" => "Greek",
                  "en" => "English",
                  "es" => "Spanish",
                  "et" => "Estonian",
                  "fi" => "Finnish",
                  "fr" => "French",
                  "hr" => "Croatian",
                  "hu" => "Hungarian",
                  "id" => "Indonesian",
                  "is" => "Icelandic",
                  "it" => "Italian",
                  "iw" => "Hebrew",
                  "ja" => "Japanese",
                  "ko" => "Korean",
                  "lt" => "Lithuanian",
                  "lv" => "Latvian",
                  "nl" => "Dutch",
                  "no" => "Norwegian",
                  "pl" => "Polish",
                  "pt" => "Portuguese",
                  "ro" => "Romanian",
                  "ru" => "Russian",
                  "sk" => "Slovak",
                  "sl" => "Slovenian",
                  "sr" => "Serbian",
                  "sv" => "Swedish",
                  "tr" => "Turkish",
                  "zh-CN" => "Chinese (Simplified)",
                  "zh-TW" => "Chinese (Traditional)"
                ]
              ],
              "sort" => [
                "display" => "Sort by",
                "option" => [
                  "relevance" => "Relevance",
                  "date" => "Date"
                ]
              ],
              "redundant" => [ // filter=1 / filter=0
                "display" => "Remove redundant",
                "option" => [
                  "yes" => "Yes",
                  "no" => "No",
                ]
              ]
            ]
          );

        case "images":
          return array_merge(
            $base,
            [
              "size" => [ // imgsz
                "display" => "Size",
                "option" => [
                  "any" => "Any size",
                  "l" => "Large",
                  "m" => "Medium",
                  "i" => "Icon",
                  "qsvga" => "Larger than 400x300",
                  "vga" => "Larger than 640x480",
                  "svga" => "Larger than 800x600",
                  "xga" => "Larger than 1024x768",
                  "2mp" => "Larger than 2MP",
                  "4mp" => "Larger than 4MP",
                  "6mp" => "Larger than 6MP",
                  "8mp" => "Larger than 8MP",
                  "10mp" => "Larger than 10MP",
                  "12mp" => "Larger than 12MP",
                  "15mp" => "Larger than 15MP",
                  "20mp" => "Larger than 20MP",
                  "40mp" => "Larger than 40MP",
                  "70mp" => "Larger than 70MP"
                ]
              ],
              "color" => [ // imgc
                "display" => "Color",
                "option" => [
                  "any" => "Any color",
                  "color" => "Full color",
                  "bnw" => "Black & white",
                  "trans" => "Transparent",
                  // from here, imgcolor
                  "red" => "Red",
                  "orange" => "Orange",
                  "yellow" => "Yellow",
                  "green" => "Green",
                  "teal" => "Teal",
                  "blue" => "Blue",
                  "purple" => "Purple",
                  "pink" => "Pink",
                  "white" => "White",
                  "gray" => "Gray",
                  "black" => "Black",
                  "brown" => "Brown"
                ]
              ],
              "format" => [ // as_filetype
                "display" => "Format",
                "option" => [
                  "any" => "Any format",
                  "jpg" => "JPG",
                  "gif" => "GIF",
                  "png" => "PNG",
                  "bmp" => "BMP",
                  "svg" => "SVG",
                  "webp" => "WEBP",
                  "ico" => "ICO",
                  "craw" => "RAW"
                ]
              ]
            ]
          );
      }

      // unknown page: same result as the previous implicit fall-through
      return null;
    }

    /**
     * Performs a GET request through the proxy chosen by the backend helper.
     *
     * @param mixed  $proxy   proxy descriptor, passed as-is to backend::assign_proxy()
     * @param string $url     URL without query string
     * @param array  $get     query parameters, appended with http_build_query()
     * @param int    $reqtype self::req_js (script-like headers) or
     *                        self::req_html (document-like headers)
     * @return string response body
     * @throws Exception when cURL reports an error
     */
    private function get($proxy, $url, $get = [], $reqtype = self::req_js){

      if($get !== []){

        $url .= "?" . http_build_query($get);
      }

      if($reqtype === self::req_js){

        $headers = [
          "User-Agent: " . config::USER_AGENT,
          "Accept: */*",
          "Accept-Language: en-US,en;q=0.5",
          "Accept-Encoding: gzip",
          "DNT: 1",
          "Sec-GPC: 1",
          "Alt-Used: cse.google.com",
          "Connection: keep-alive",
          "Referer: https://cse.google.com/cse?cx=" . config::GOOGLE_CX_ENDPOINT,
          "Sec-Fetch-Dest: script",
          "Sec-Fetch-Mode: no-cors",
          "Sec-Fetch-Site: same-origin",
          "TE: trailers"
        ];
      }else{

        $headers = [
          "User-Agent: " . config::USER_AGENT,
          "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8",
          "Accept-Language: en-US,en;q=0.5",
          "Accept-Encoding: gzip",
          "DNT: 1",
          "Sec-GPC: 1",
          "Connection: keep-alive",
          "Upgrade-Insecure-Requests: 1",
          "Sec-Fetch-Dest: document",
          "Sec-Fetch-Mode: navigate",
          "Sec-Fetch-Site: none",
          "Sec-Fetch-User: ?1",
          "Priority: u=0, i"
        ];
      }

      $curlproc = curl_init();

      try{

        curl_setopt($curlproc, CURLOPT_URL, $url);

        // http2 bypass
        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        if(curl_errno($curlproc)){

          throw new Exception(curl_error($curlproc));
        }

        return $data;

      }finally{

        // release the handle on the error path as well
        curl_close($curlproc);
      }
    }

    /**
     * Runs a web search.
     *
     * @param array $get request values; reads "npt", "s", "nsfw", "lang",
     *                   "country", "redundant", "sort" and "spellcheck"
     * @return array result structure with the keys status, spelling, npt,
     *               answer, web, image, video, news and related
     * @throws Exception on transport errors, unparsable responses, captchas
     *                   or when Google returns an error object
     */
    public function web($get){

      // page 1
      // https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&cselibv=8fa85d58e016b414&cx=d4e68b99b876541f0&q=asmr&safe=active&cse_tok=AB-tC_6RPUTmB4XK0lE9e1AFFC5r%3A1729563832926&lr=&cr=&gl=&filter=0&sort=&as_oq=&as_sitesearch=&exp=cc%2Capo&oq=asmr&gs_l=partner-web.3..0i512i433j0i512i433i131l2j0i512i433j0i512i433i131j0i512i433j0i512i433i131l2j0i512l2.10902.266627.5.267157.11.10.0.0.0.0.188.1108.2j7.9.0.csems%2Cnrl%3D10...0....1.34.partner-web..42.14.1500.WJQvMvfXkx4&cseclient=hosted-page-client&callback=google.search.cse.api8223&rurl=https%3A%2F%2Fcse.google.com%2Fcse%3Fcx%3Dd4e68b99b876541f0%23gsc.tab%3D0%26gsc.q%3Dtest%26gsc.sort%3D

      // page 2
      // https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&start=10&cselibv=8fa85d58e016b414&cx=d4e68b99b876541f0&q=asmr&safe=active&cse_tok=AB-tC_6RPUTmB4XK0lE9e1AFFC5r%3A1729563832926&lr=&cr=&gl=&filter=0&sort=&as_oq=&as_sitesearch=&exp=cc%2Capo&callback=google.search.cse.api3595&rurl=https%3A%2F%2Fcse.google.com%2Fcse%3Fcx%3Dd4e68b99b876541f0%23gsc.tab%3D0%26gsc.q%3Dtest%26gsc.sort%3D

      if($get["npt"]){

        [$req_params, $proxy] = $this->restore_request($get["npt"], "web");

        $json = $this->fetch_results($proxy, $req_params);
      }else{

        $proxy = $this->backend->get_ip();
        $params = $this->generate_token($proxy);

        $req_params = [
          "rsz" => "filtered_cse",
          "num" => 20,
          "hl" => "en",
          "source" => "gcsc",
          "cselibv" => $params["lib"],
          "cx" => config::GOOGLE_CX_ENDPOINT,
          "q" => $get["s"],
          "safe" => $get["nsfw"] === "yes" ? "off" : "active",
          "cse_tok" => $params["token"],
          "lr" => $get["lang"] === "any" ? "" : "lang_" . $get["lang"],
          "cr" => $get["country"] === "any" ? "" : "country" . strtoupper($get["country"]),
          "gl" => "",
          "filter" => $get["redundant"] === "yes" ? "1" : "0",
          "sort" => $get["sort"] === "relevance" ? "" : "date",
          "as_oq" => "",
          "as_sitesearch" => "",
          "exp" => "cc,apo",
          "oq" => $get["s"],
          "gs_l" => "partner-web.3...33294.34225.3.34597.26.11.0.0.0.0.201.1132.6j4j1.11.0.csems,nrl=10...0....1.34.partner-web..34.19.1897.FKEeG5yh2iw",
          "cseclient" => "hosted-page-client",
          "callback" => "google.search.cse.api" . random_int(4000, 99999),
          "rurl" => "https://cse.google.com/cse?cx=" . config::GOOGLE_CX_ENDPOINT . "#gsc.tab=0&gsc.q=" . $get["s"] . "&gsc.sort="
        ];

        if($get["spellcheck"] === "no"){

          $req_params["nfpr"] = "1";
        }

        $json = $this->fetch_results($proxy, $req_params);

        // gs_l is only sent with the first page (see sample URLs above)
        unset($req_params["gs_l"]);
        $req_params["start"] = 0;
      }

      // parameters for the page after the one just fetched (num = 20)
      $req_params["start"] = (int)($req_params["start"] ?? 0) + 20;

      $out = [
        "status" => "ok",
        "spelling" => [
          "type" => "no_correction",
          "using" => null,
          "correction" => null
        ],
        "npt" => null,
        "answer" => [],
        "web" => [],
        "image" => [],
        "video" => [],
        "news" => [],
        "related" => []
      ];

      // detect word correction
      if(isset($json["spelling"]["type"])){

        $out["spelling"] = $this->parse_spelling($json["spelling"]);
      }

      if(!isset($json["results"])){

        return $out;
      }

      foreach($json["results"] as $result){

        // get date from description
        [$description, $date] =
          $this->parse_description(
            $result["contentNoFormatting"] ?? ""
          );

        $out["web"][] = [
          "title" => rtrim($result["titleNoFormatting"] ?? "", " ."),
          "description" => $description,
          "url" => $result["unescapedUrl"],
          "date" => $date,
          "type" => "web",
          "thumb" => $this->parse_thumb($result),
          "sublink" => [],
          "table" => []
        ];
      }

      // detect next page
      if(
        isset($json["cursor"]["isExactTotalResults"]) || // detects last page
        !isset($json["cursor"]["pages"]) // detects no results on page
      ){

        return $out;
      }

      // get next page
      $out["npt"] =
        $this->backend->store(
          json_encode($req_params),
          "web",
          $proxy
        );

      return $out;
    }

    /**
     * Runs an image search.
     *
     * @param array $get request values; reads "npt", "s", "nsfw", "country",
     *                   "size", "format" and "color"
     * @return array result structure with the keys status, npt and image
     * @throws Exception on transport errors, unparsable responses, captchas
     *                   or when Google returns an error object
     */
    public function image($get){

      if($get["npt"]){

        [$req_params, $proxy] = $this->restore_request($get["npt"], "images");

        $json = $this->fetch_results($proxy, $req_params);
      }else{

        $proxy = $this->backend->get_ip();
        $params = $this->generate_token($proxy);

        $req_params = [
          "rsz" => "filtered_cse",
          "num" => 20,
          "hl" => "en",
          "source" => "gcsc",
          "cselibv" => $params["lib"],
          "searchtype" => "image",
          "cx" => config::GOOGLE_CX_ENDPOINT,
          "q" => $get["s"],
          "safe" => $get["nsfw"] === "yes" ? "off" : "active",
          "cse_tok" => $params["token"],
          "exp" => "cc,apo",
          "cseclient" => "hosted-page-client",
          "callback" => "google.search.cse.api" . random_int(4000, 99999),
          "rurl" => "https://cse.google.com/cse?cx=" . config::GOOGLE_CX_ENDPOINT . "#gsc.tab=1&gsc.q=" . $get["s"] . "&gsc.sort="
        ];

        // add additional hidden filters

        // country
        if($get["country"] !== "any"){

          $req_params["cr"] = "country" . strtoupper($get["country"]);
        }

        // size
        if($get["size"] !== "any"){

          $req_params["imgsz"] = $get["size"];
        }

        // format
        if($get["format"] !== "any"){

          $req_params["as_filetype"] = $get["format"];
        }

        // color
        if($get["color"] !== "any"){

          if(
            $get["color"] === "color" ||
            $get["color"] === "trans"
          ){

            $req_params["imgc"] = $get["color"];
          }elseif($get["color"] === "bnw"){

            $req_params["imgc"] = "gray";
          }else{

            $req_params["imgcolor"] = $get["color"];
          }
        }

        $json = $this->fetch_results($proxy, $req_params);

        $req_params["start"] = 0;
      }

      // parameters for the page after the one just fetched (num = 20)
      $req_params["start"] = (int)($req_params["start"] ?? 0) + 20;

      $out = [
        "status" => "ok",
        "npt" => null,
        "image" => []
      ];

      if(!isset($json["results"])){

        return $out;
      }

      // Collect the results BEFORE looking at the cursor: the last page still
      // carries results even though no further page exists.
      foreach($json["results"] as $result){

        $out["image"][] = [
          "title" => rtrim($result["titleNoFormatting"] ?? "", " ."),
          "source" => [
            [
              "url" => $result["unescapedUrl"],
              "width" => (int)$result["width"],
              "height" => (int)$result["height"]
            ],
            [
              "url" => $result["tbLargeUrl"],
              "width" => (int)$result["tbLargeWidth"],
              "height" => (int)$result["tbLargeHeight"]
            ]
          ],
          "url" => $result["originalContextUrl"]
        ];
      }

      // detect next page
      if(
        isset($json["cursor"]["isExactTotalResults"]) || // detects last page
        !isset($json["cursor"]["pages"]) // detects no results on page
      ){

        return $out;
      }

      // get next page
      $out["npt"] =
        $this->backend->store(
          json_encode($req_params),
          "images",
          $proxy
        );

      return $out;
    }

    /**
     * Restores the request parameters and proxy stored for a next page token.
     *
     * @param string $npt  next page token
     * @param string $page storage namespace ("web" or "images")
     * @return array [request parameters (array), proxy]
     * @throws Exception when the stored parameters cannot be decoded
     */
    private function restore_request($npt, $page){

      [$req_params, $proxy] = $this->backend->get($npt, $page);

      $req_params = json_decode($req_params, true);

      if(!is_array($req_params)){

        throw new Exception("Failed to decode next page token");
      }

      return [$req_params, $proxy];
    }

    /**
     * Calls the CSE element endpoint and returns the decoded JSONP payload.
     *
     * @param mixed $proxy      proxy descriptor handed to get()
     * @param array $req_params query parameters
     * @return array decoded payload
     * @throws Exception see get() and parse_jsonp()
     */
    private function fetch_results($proxy, $req_params){

      return
        $this->parse_jsonp(
          $this->get(
            $proxy,
            "https://cse.google.com/cse/element/v1",
            $req_params,
            self::req_js
          )
        );
    }

    /**
     * Extracts and decodes the JSON argument of a
     * google.search.cse.<callback>(...); JSONP response.
     *
     * @param string $body raw response body
     * @return array decoded payload
     * @throws Exception when the wrapper is not found, the JSON cannot be
     *                   decoded, or the payload contains an "error" member
     */
    private function parse_jsonp($body){

      if(
        !is_string($body) ||
        !preg_match(
          '/google\.search\.cse\.[A-Za-z0-9]+\(([\S\s]*)\);/i',
          $body,
          $match
        )
      ){

        throw new Exception("Failed to grep JSON");
      }

      $json = json_decode($match[1], true);

      if(!is_array($json)){

        // previously a broken payload was silently treated as "no results"
        throw new Exception("Failed to decode JSON");
      }

      if(isset($json["error"])){

        if(isset($json["error"]["errors"][0]["message"])){

          throw new Exception("Google returned an error: " . $json["error"]["errors"][0]["message"]);
        }

        if(isset($json["error"]["message"])){

          throw new Exception("Google returned an error: " . $json["error"]["message"]);
        }

        throw new Exception("Google returned an error object");
      }

      return $json;
    }

    /**
     * Converts the "spelling" member of a response to the output format.
     *
     * @param array $spelling spelling object; "type" is expected to be set
     * @return array ["type" => ..., "using" => ..., "correction" => ...]
     */
    private function parse_spelling($spelling){

      // "DYM" = did you mean? @TODO fix wording
      // anything else (e.g. "SPELL_CORRECTED_RESULTS") = not many results for
      $type = $spelling["type"] === "DYM" ? "including" : "not_many";

      // stays null when the response carries none of the known keys
      $using = null;

      if(isset($spelling["originalQuery"])){

        $using = $spelling["originalQuery"];
      }elseif(isset($spelling["anchor"])){

        $using = html_entity_decode(strip_tags($spelling["anchor"]));
      }elseif(isset($spelling["originalAnchor"])){

        $using = html_entity_decode(strip_tags($spelling["originalAnchor"]));
      }

      return [
        "type" => $type,
        "using" => $using,
        "correction" => $spelling["correctedQuery"] ?? null
      ];
    }

    /**
     * Splits a result snippet into its description and an optional leading date.
     *
     * A snippet of the form "<date> ... <text>" yields the timestamp of <date>
     * when strtotime() understands it; otherwise the whole snippet is kept.
     *
     * @param string $content snippet text
     * @return array [description (string), date (int timestamp or null)]
     */
    private function parse_description($content){

      $parts =
        explode(
          "...",
          trim($content, " ."),
          2
        );

      $date = null;
      $description = implode("...", $parts);

      if(count($parts) === 2){

        $timestamp = strtotime($parts[0]);

        if($timestamp){

          $date = $timestamp;
          $description = ltrim($parts[1]);
        }
      }

      return [trim($description, " ."), $date];
    }

    /**
     * Builds the thumbnail entry of a web result.
     *
     * @param array $result one entry of the response's "results" list
     * @return array ["url" => string|null, "ratio" => "16:9"|"1:1"|"9:16"|null]
     */
    private function parse_thumb($result){

      if(isset($result["richSnippet"]["cseThumbnail"]["src"])){

        $url = $this->unshit_thumb($result["richSnippet"]["cseThumbnail"]["src"]);
      }elseif(isset($result["richSnippet"]["cseImage"]["src"])){

        $url = $result["richSnippet"]["cseImage"]["src"];
      }else{

        return [
          "url" => null,
          "ratio" => null
        ];
      }

      $thumb = [
        "url" => $url,
        "ratio" => "1:1"
      ];

      // find correct ratio
      $width = 0;
      $height = 0;

      if(
        isset($result["richSnippet"]["cseThumbnail"]["width"]) &&
        isset($result["richSnippet"]["cseThumbnail"]["height"])
      ){

        $width = (int)$result["richSnippet"]["cseThumbnail"]["width"];
        $height = (int)$result["richSnippet"]["cseThumbnail"]["height"];
      }elseif(
        isset($result["richSnippet"]["metatags"]["ogImageWidth"]) &&
        isset($result["richSnippet"]["metatags"]["ogImageHeight"])
      ){

        $width = (int)$result["richSnippet"]["metatags"]["ogImageWidth"];
        $height = (int)$result["richSnippet"]["metatags"]["ogImageHeight"];
      }

      // calculate rounded ratio; unusable sizes (0 or non-numeric) keep the
      // default instead of dividing by zero
      if(
        $width > 0 &&
        $height > 0
      ){

        $aspect_ratio = $width / $height;

        if($aspect_ratio >= 1.5){

          $thumb["ratio"] = "16:9";
        }elseif($aspect_ratio >= 0.8){

          $thumb["ratio"] = "1:1";
        }else{

          $thumb["ratio"] = "9:16";
        }
      }

      return $thumb;
    }

    /**
     * Obtains the search token and library version needed by the endpoint.
     *
     * @param mixed $proxy proxy descriptor handed to get()
     * @return array ["token" => cse_token, "lib" => cselibVersion]
     * @throws Exception on a captcha page or when the values cannot be found
     */
    private function generate_token($proxy){

      $html =
        $this->get(
          $proxy,
          "https://cse.google.com/cse",
          [
            "cx" => config::GOOGLE_CX_ENDPOINT
          ],
          self::req_html
        );

      // detect captcha
      $this->fuckhtml->load($html);

      $title =
        $this->fuckhtml
        ->getElementsByTagName(
          "title"
        );

      if(
        count($title) !== 0 &&
        $title[0]["innerHTML"] == "302 Moved"
      ){

        throw new Exception("Google returned a captcha");
      }

      // get token
      if(
        !preg_match(
          '/relativeUrl=\'([^\']+)\';/i',
          $html,
          $js_uri
        )
      ){

        throw new Exception("Failed to grep search token");
      }

      $js_uri =
        $this->fuckhtml
        ->parseJsString(
          $js_uri[1]
        );

      // get parameters
      $js =
        $this->get(
          $proxy,
          "https://cse.google.com" . $js_uri,
          [],
          self::req_js
        );

      if(
        !preg_match(
          '/}\)\(({[\S\s]+})\);/',
          $js,
          $match
        )
      ){

        throw new Exception("Failed to grep JSON parameters");
      }

      $json = json_decode($match[1], true);

      if(
        !isset(
          $json["cse_token"],
          $json["cselibVersion"]
        )
      ){

        throw new Exception("Failed to decode JSON parameters");
      }

      return [
        "token" => $json["cse_token"],
        "lib" => $json["cselibVersion"]
      ];
    }

    /**
     * Reduces a gstatic thumbnail URL to its "q" parameter only.
     * URLs on other hosts, or without a usable "q", are returned unchanged.
     *
     * @param string $url thumbnail URL
     * @return string cleaned URL
     */
    private function unshit_thumb($url){

      // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj
      // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA

      $parts = parse_url($url);

      if(
        !isset(
          $parts["host"],
          $parts["query"]
        ) ||
        !preg_match(
          '/tbn.*\.gstatic\.com/',
          $parts["host"]
        )
      ){

        return $url;
      }

      parse_str($parts["query"], $params);

      if(
        isset($params["q"]) &&
        is_string($params["q"])
      ){

        return "https://" . $parts["host"] . "/images?q=" . $params["q"];
      }

      return $url;
    }
  }