greppr.php 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. <?php
  /**
   * Scraper for the greppr.org web search engine.
   *
   * greppr guards its search endpoint with three values, which this class
   * caches in APCu under the key "greppr_token":
   *   [0] static token: the NAME of the query-string parameter that carries
   *       the search term (changes once a day)
   *   [1] dynamic token: the value of the "n" parameter (changes on every
   *       request)
   *   [2] the PHPSESSID session cookie the two tokens above were issued for
   */
  class greppr{

      /**
       * @var backend Helper from lib/backend.php. This class uses it to pick
       *              a proxy, attach it to a cURL handle and store/fetch
       *              next-page tokens.
       */
      public $backend;

      /** @var fuckhtml HTML parser from lib/fuckhtml.php. */
      public $fuckhtml;

      /**
       * Loads the backend and HTML parser libraries and instantiates them.
       *
       * include_once is used so that constructing the scraper more than once
       * in a single request does not redeclare the library classes.
       */
      public function __construct(){

          include_once "lib/backend.php";
          $this->backend = new backend("greppr");

          include_once "lib/fuckhtml.php";
          $this->fuckhtml = new fuckhtml();
      }

      /**
       * Returns the search filters available for a page.
       *
       * @param string $page Requested page type (unused).
       * @return array Always empty: this scraper exposes no filters.
       */
      public function getfilters($page){

          return [];
      }

      /**
       * Performs a GET request through the given proxy.
       *
       * @param mixed        $proxy  Proxy descriptor, handed untouched to
       *                             backend::assign_proxy().
       * @param string       $url    URL without query string.
       * @param array        $get    Query parameters; appended to $url when
       *                             not empty.
       * @param string|false $cookie PHPSESSID value to send, or false to send
       *                             no cookie.
       * @return array ["headers" => array<string,string>, "data" => string].
       *               Header names are lowercased. A header that occurs more
       *               than once keeps every value, separated by "\n".
       * @throws Exception With the cURL error message when the transfer fails.
       */
      private function get($proxy, $url, $get = [], $cookie = false){

          if($get !== []){

              $url .= "?" . http_build_query($get);
          }

          // Browser-like header set. The Cookie line, when present, keeps the
          // position it would have in a real browser request.
          $request_headers = [
              "User-Agent: " . config::USER_AGENT,
              "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
              "Accept-Language: en-US,en;q=0.5",
              "Accept-Encoding: gzip"
          ];

          if($cookie !== false){

              $request_headers[] = "Cookie: PHPSESSID=" . $cookie;
          }

          array_push(
              $request_headers,
              "DNT: 1",
              "Connection: keep-alive",
              "Upgrade-Insecure-Requests: 1",
              "Sec-Fetch-Dest: document",
              "Sec-Fetch-Mode: navigate",
              "Sec-Fetch-Site: none",
              "Sec-Fetch-User: ?1"
          );

          $curlproc = curl_init();

          curl_setopt($curlproc, CURLOPT_URL, $url);
          curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
          curl_setopt($curlproc, CURLOPT_HTTPHEADER, $request_headers);

          curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
          curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
          curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

          $this->backend->assign_proxy($curlproc, $proxy);

          // collect response headers
          $headers = [];

          curl_setopt(
              $curlproc,
              CURLOPT_HEADERFUNCTION,
              function($curlproc, $header) use (&$headers){

                  // cURL requires the callback to return the bytes consumed
                  $len = strlen($header);
                  $header = explode(":", $header, 2);

                  if(count($header) < 2){

                      // ignore invalid headers (status line, blank line)
                      return $len;
                  }

                  $name = strtolower(trim($header[0]));
                  $value = trim($header[1]);

                  // A server may send the same header several times (typically
                  // Set-Cookie). Keep every value instead of only the last.
                  $headers[$name] =
                      isset($headers[$name]) ?
                      $headers[$name] . "\n" . $value :
                      $value;

                  return $len;
              }
          );

          $data = curl_exec($curlproc);

          if(curl_errno($curlproc)){

              // read the message first, then release the handle before throwing
              $error = curl_error($curlproc);
              curl_close($curlproc);

              throw new Exception($error);
          }

          curl_close($curlproc);

          return [
              "headers" => $headers,
              "data" => $data
          ];
      }

      /**
       * Runs a web search, or fetches a following page of one.
       *
       * @param array $get           Request parameters. "npt" holds a
       *                             next-page token (falsy for a new search);
       *                             "s" holds the search term.
       * @param bool  $first_attempt Internal. false forces a fresh token fetch
       *                             and disables any further retry.
       * @return array Response object with the keys status, spelling, npt,
       *               answer, web, image, video, news and related. Only "web"
       *               and "npt" are populated by this scraper.
       * @throws Exception On an empty search term, a network failure, or when
       *                   the search tokens cannot be obtained.
       */
      public function web($get, $first_attempt = true){

          if($get["npt"]){

              // following page: the stored payload carries the query and the
              // offset; the proxy of the first request is reused
              [$q, $proxy] = $this->backend->get($get["npt"], "web");
              $q = json_decode($q, true);
          }else{

              $search = $get["s"];

              if(strlen($search) === 0){

                  throw new Exception("Search term is empty!");
              }

              $proxy = $this->backend->get_ip();
          }

          // get token
          // token[0] = static token that changes once a day
          // token[1] = dynamic token that changes on every request
          // token[2] = PHPSESSID cookie
          $tokens = apcu_fetch("greppr_token");

          if(
              $tokens === false ||
              $first_attempt === false // force token fetch
          ){

              // we haven't gotten the token yet, get it
              try{

                  $response =
                      $this->get(
                          $proxy,
                          "https://greppr.org",
                          []
                      );
              }catch(Exception $error){

                  throw new Exception("Failed to fetch search tokens", 0, $error);
              }

              $tokens = $this->parse_token($response);

              if($tokens === false){

                  throw new Exception("Failed to grep search tokens");
              }
          }

          try{

              if($get["npt"]){

                  $params = [
                      $tokens[0] => $q["q"],
                      "s" => $q["s"],
                      "l" => 30,
                      "n" => $tokens[1]
                  ];
              }else{

                  $params = [
                      $tokens[0] => $search,
                      "n" => $tokens[1]
                  ];
              }

              $searchresults =
                  $this->get(
                      $proxy,
                      "https://greppr.org/search",
                      $params,
                      $tokens[2]
                  );
          }catch(Exception $error){

              throw new Exception("Failed to fetch search page", 0, $error);
          }

          if(strlen($searchresults["data"]) === 0){

              // redirected to main page, which means we got old token
              // generate a new one
              // ... unless we just tried to do that
              if($first_attempt === false){

                  throw new Exception("Failed to get a new search token");
              }

              return $this->web($get, false);
          }

          // refresh the token with new data (this also triggers fuckhtml load)
          $this->parse_token($searchresults, $tokens[2]);

          // response object
          $out = [
              "status" => "ok",
              "spelling" => [
                  "type" => "no_correction",
                  "using" => null,
                  "correction" => null
              ],
              "npt" => null,
              "answer" => [],
              "web" => [],
              "image" => [],
              "video" => [],
              "news" => [],
              "related" => []
          ];

          // Grab the result nodes now: loading the pagination element below
          // replaces the document the parser is looking at.
          $results =
              $this->fuckhtml
              ->getElementsByClassName(
                  "result",
                  "div"
              );

          // check for next page
          $next_elem =
              $this->fuckhtml
              ->getElementsByClassName(
                  "pagination",
                  "ul"
              );

          if(count($next_elem) !== 0){

              $this->fuckhtml->load($next_elem[0]);

              $as =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "page-link",
                      "a"
                  );

              // The link whose href is "#" is presumably the current page;
              // the link right after it leads to the next page.
              $break = false;

              foreach($as as $a){

                  $href = $a["attributes"]["href"] ?? "";

                  if($break === true){

                      parse_str(
                          $this->fuckhtml
                          ->getTextContent(
                              $href
                          ),
                          $values
                      );

                      // Parameter names are not stable (the first one is the
                      // daily token), so the values are read by position.
                      $values = array_values($values);

                      if(count($values) >= 2){

                          $out["npt"] =
                              $this->backend->store(
                                  json_encode(
                                      [
                                          "q" => $values[0],
                                          "s" => $values[1]
                                      ]
                                  ),
                                  "web",
                                  $proxy
                              );
                      }

                      break;
                  }

                  if($href === "#"){

                      $break = true;
                  }
              }
          }

          // scrape results
          foreach($results as $result){

              $this->fuckhtml->load($result);

              $links =
                  $this->fuckhtml
                  ->getElementsByTagName(
                      "a"
                  );

              if(
                  count($links) === 0 ||
                  !isset($links[0]["attributes"]["href"])
              ){

                  // a result without a link is of no use, skip it
                  continue;
              }

              $a = $links[0];

              $description =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "highlightedDesc",
                      "p"
                  );

              if(count($description) === 0){

                  $description = null;
              }else{

                  $description =
                      $this->limitstrlen(
                          $this->fuckhtml
                          ->getTextContent(
                              $description[0]
                          )
                      );
              }

              // the last paragraph of a result holds "<label>:<date>"
              $paragraphs =
                  $this->fuckhtml
                  ->getElementsByTagName(
                      "p"
                  );

              $date = null;

              if(count($paragraphs) !== 0){

                  $date =
                      $this->extract_timestamp(
                          $this->fuckhtml
                          ->getTextContent(
                              $paragraphs[count($paragraphs) - 1]["innerHTML"]
                          )
                      );
              }

              $out["web"][] = [
                  "title" =>
                      $this->fuckhtml
                      ->getTextContent(
                          $a["innerHTML"]
                      ),
                  "description" => $description,
                  "url" =>
                      $this->fuckhtml
                      ->getTextContent(
                          $a["attributes"]["href"]
                      ),
                  "date" => $date,
                  "type" => "web",
                  "thumb" => [
                      "url" => null,
                      "ratio" => null
                  ],
                  "sublink" => [],
                  "table" => []
              ];
          }

          return $out;
      }

      /**
       * Extracts the search tokens from a greppr page and caches them in APCu.
       *
       * Side effect: loads $response["data"] into the shared HTML parser, so
       * the caller can query that document afterwards.
       *
       * @param array        $response Return value of get().
       * @param string|false $cookie   Session cookie the request was sent
       *                               with. When false, the cookie is read
       *                               from the Set-Cookie response header.
       * @return array|false [static token, dynamic token, PHPSESSID], or false
       *                     when the tokens or the cookie cannot be found.
       */
      private function parse_token($response, $cookie = false){

          $this->fuckhtml->load($response["data"]);

          $scripts =
              $this->fuckhtml
              ->getElementsByTagName("script");

          $found = false;

          foreach($scripts as $script){

              // the page embeds a redirect to /search whose first parameter
              // name is the static token and whose "n" is the dynamic token
              preg_match(
                  '/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
                  $script["innerHTML"],
                  $tokens
              );

              if(isset($tokens[1])){

                  $found = true;
                  break;
              }
          }

          if($found === false){

              return false;
          }

          $tokens = [
              $tokens[1],
              $tokens[2]
          ];

          if($cookie === false){

              if(!isset($response["headers"]["set-cookie"])){

                  // server didn't send a cookie
                  return false;
              }

              $cookie =
                  $this->extract_session_cookie(
                      $response["headers"]["set-cookie"]
                  );

              if($cookie === false){

                  // server sent an unexpected cookie
                  return false;
              }
          }

          // when a cookie was passed in, the tokens belong to that session
          $tokens[] = $cookie;

          apcu_store("greppr_token", $tokens);

          return $tokens;
      }

      /**
       * Finds the PHPSESSID value inside Set-Cookie header data.
       *
       * @param string $set_cookie One Set-Cookie value, or several values
       *                           separated by newlines.
       * @return string|false The session id, or false when it is absent.
       */
      private function extract_session_cookie($set_cookie){

          // stop at whitespace too, so a newline-joined list of cookies never
          // bleeds into the captured value
          if(preg_match('/PHPSESSID=([^;\s]+)/', $set_cookie, $match) !== 1){

              return false;
          }

          return $match[1];
      }

      /**
       * Converts a "<label>:<date>" string into a UNIX timestamp.
       *
       * Only the part between the first and the second colon is considered.
       *
       * @param string $text Text content of a result's date paragraph.
       * @return int|null Timestamp, or null when there is no colon or the
       *                  date cannot be parsed.
       */
      private function extract_timestamp($text){

          $parts = explode(":", $text);

          if(!isset($parts[1])){

              return null;
          }

          $timestamp = strtotime(trim($parts[1]));

          return $timestamp === false ? null : $timestamp;
      }

      /**
       * Truncates text to at most 300 bytes on a word boundary.
       *
       * @param string $text
       * @return string The first line produced by wrapping $text at 300
       *                characters. A single word longer than that is
       *                returned whole.
       */
      private function limitstrlen($text){

          return explode("\n", wordwrap($text, 300, "\n"))[0];
      }
  }