marginalia.php 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. <?php
  /**
   * Scraper for the Marginalia search engine.
   *
   * When config::MARGINALIA_API_KEY is set, results are fetched from the JSON
   * API at api.marginalia.nu; otherwise the HTML result page at
   * search.marginalia.nu is fetched and parsed.
   */
  class marginalia{

      /**
       * HTML parser instance created from lib/fuckhtml.php.
       * Declared explicitly because dynamic properties are deprecated as of PHP 8.2.
       */
      public $fuckhtml;

      /**
       * Helper created from lib/backend.php; used here for get_ip(),
       * assign_proxy(), store() and get().
       */
      public $backend;

      /**
       * Loads the helper libraries and instantiates them.
       */
      public function __construct(){

          // include_once: building a second scraper instance must not
          // re-declare the helper classes
          include_once "lib/fuckhtml.php";
          $this->fuckhtml = new fuckhtml();

          include_once "lib/backend.php";
          $this->backend = new backend("marginalia");
      }

      /**
       * Returns the filter definitions exposed for this scraper.
       *
       * The "adtech", "recent" and "intitle" toggles are only offered when no
       * API key is configured, because web() only applies them on the HTML
       * code path.
       *
       * @param mixed $page Unused by this method.
       * @return array Map of filter name => ["display" => string, "option" => array].
       */
      public function getfilters($page){

          $filters = [];

          if(config::MARGINALIA_API_KEY === null){

              $yesno = [
                  "no" => "No",
                  "yes" => "Yes"
              ];

              $filters = [
                  "adtech" => [
                      "display" => "Reduce adtech",
                      "option" => $yesno
                  ],
                  "recent" => [
                      "display" => "Recent results",
                      "option" => $yesno
                  ],
                  "intitle" => [
                      "display" => "Search in title",
                      "option" => $yesno
                  ]
              ];
          }

          return array_merge(
              $filters,
              [
                  "format" => [
                      "display" => "Format",
                      "option" => [
                          "any" => "Any format",
                          "html5" => "html5",
                          "xhtml" => "xhtml",
                          "html123" => "html123"
                      ]
                  ],
                  "file" => [
                      "display" => "Filetype",
                      "option" => [
                          "any" => "Any filetype",
                          "nomedia" => "Deny media",
                          "media" => "Contains media",
                          "audio" => "Contains audio",
                          "video" => "Contains video",
                          "archive" => "Contains archive",
                          "document" => "Contains document"
                      ]
                  ],
                  "javascript" => [
                      "display" => "Javascript",
                      "option" => [
                          "any" => "Allow JS",
                          "deny" => "Deny JS",
                          "require" => "Require JS"
                      ]
                  ],
                  "trackers" => [
                      "display" => "Trackers",
                      "option" => [
                          "any" => "Allow trackers",
                          "deny" => "Deny trackers",
                          "require" => "Require trackers"
                      ]
                  ],
                  "cookies" => [
                      "display" => "Cookies",
                      "option" => [
                          "any" => "Allow cookies",
                          "deny" => "Deny cookies",
                          "require" => "Require cookies"
                      ]
                  ],
                  "affiliate" => [
                      "display" => "Affiliate links in body",
                      "option" => [
                          "any" => "Allow affiliate links",
                          "deny" => "Deny affiliate links",
                          "require" => "Require affiliate links"
                      ]
                  ]
              ]
          );
      }

      /**
       * Performs an HTTP GET request with browser-like headers.
       *
       * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
       * @param string $url   Target URL.
       * @param array  $get   Query parameters; appended to $url when non-empty.
       * @return string|bool  Whatever curl_exec() returned.
       * @throws Exception    With the cURL error message when the transfer fails.
       */
      private function get($proxy, $url, $get = []){

          $headers = [
              "User-Agent: " . config::USER_AGENT,
              "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
              "Accept-Language: en-US,en;q=0.5",
              "Accept-Encoding: gzip",
              "DNT: 1",
              "Connection: keep-alive",
              "Upgrade-Insecure-Requests: 1",
              "Sec-Fetch-Dest: document",
              "Sec-Fetch-Mode: navigate",
              "Sec-Fetch-Site: none",
              "Sec-Fetch-User: ?1"
          ];

          if($get !== []){

              $url .= "?" . http_build_query($get);
          }

          $curlproc = curl_init();

          try{

              curl_setopt($curlproc, CURLOPT_URL, $url);
              curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
              curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
              curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
              curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
              curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
              curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
              curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

              $this->backend->assign_proxy($curlproc, $proxy);

              $data = curl_exec($curlproc);

              if(curl_errno($curlproc)){

                  throw new Exception(curl_error($curlproc));
              }

              return $data;

          }finally{

              // release the handle on the error path as well
              curl_close($curlproc);
          }
      }

      /**
       * Runs a web search.
       *
       * Reads these keys from $get: "s", "format", "file", "javascript",
       * "trackers", "cookies", "affiliate", and on the HTML code path also
       * "npt", "adtech", "recent" and "intitle".
       *
       * @param array $get Search term and filter values.
       * @return array Result structure with the keys "status", "spelling",
       *               "npt", "answer", "web", "image", "video", "news" and
       *               "related". Only "web" and "npt" are filled in here.
       * @throws Exception On an empty search term or a failed/invalid response.
       */
      public function web($get){

          if(strlen($get["s"]) === 0){

              throw new Exception("Search term is empty!");
          }

          $search = $this->build_query($get);

          $out = [
              "status" => "ok",
              "spelling" => [
                  "type" => "no_correction",
                  "using" => null,
                  "correction" => null
              ],
              "npt" => null,
              "answer" => [],
              "web" => [],
              "image" => [],
              "video" => [],
              "news" => [],
              "related" => []
          ];

          // API scraper
          if(config::MARGINALIA_API_KEY !== null){

              $out["web"] = $this->search_api($search);
              return $out;
          }

          // HTML parser
          $proxy = $this->backend->get_ip();

          if($get["npt"]){

              // next page: reuse the query string and proxy stored by
              // scrape_next_page() on the previous request
              [$query, $proxy] =
                  $this->backend->get(
                      $get["npt"],
                      "web"
                  );

              $url = "https://search.marginalia.nu/search?" . $query;
              $params = [];
          }else{

              $url = "https://search.marginalia.nu/search";
              $params = $this->build_html_params($get, $search);
          }

          try{

              $html = $this->get($proxy, $url, $params);
          }catch(Exception $error){

              throw new Exception("Failed to get HTML", 0, $error);
          }

          $out["web"] = $this->scrape_results($html);
          $out["npt"] = $this->scrape_next_page($html, $proxy);

          return $out;
      }

      /**
       * Builds the query string sent to Marginalia: the search term followed by
       * the operators derived from the selected filters.
       *
       * @param array $get Search term and filter values.
       * @return string Space separated query.
       */
      private function build_query($get){

          $terms = [$get["s"]];

          // filter name => search operator used when the feature is required
          $operators = [
              "javascript" => "js:true",
              "trackers" => "special:tracking",
              "cookies" => "special:cookies",
              "affiliate" => "special:affiliate"
          ];

          foreach($operators as $filter => $operator){

              $value = $get[$filter];

              if($value === "any"){

                  continue;
              }

              // "deny" negates the operator, anything else requires it
              $terms[] = $value === "deny" ? "-" . $operator : $operator;
          }

          $format = $get["format"];

          if($format !== "any"){

              $terms[] = "format:$format";
          }

          $file = $get["file"];

          switch($file){

              case "any":
                  break;

              case "nomedia":
                  $terms[] = "-special:media";
                  break;

              case "media":
                  $terms[] = "special:media";
                  break;

              default:
                  $terms[] = "file:$file";
          }

          return implode(" ", $terms);
      }

      /**
       * Builds the GET parameters for the first HTML result page.
       *
       * @param array  $get    Filter values ("adtech", "recent", "intitle").
       * @param string $search Query produced by build_query().
       * @return array Parameters for https://search.marginalia.nu/search.
       */
      private function build_html_params($get, $search){

          $params = [
              "query" => $search
          ];

          // filter name => [request parameter, value sent when set to "yes"]
          $toggles = [
              "adtech" => ["adtech", "reduce"],
              "recent" => ["recent", "recent"],
              "intitle" => ["searchTitle", "title"]
          ];

          foreach($toggles as $filter => $param){

              if(($get[$filter] ?? null) === "yes"){

                  $params[$param[0]] = $param[1];
              }
          }

          return $params;
      }

      /**
       * Queries the JSON API and converts its results.
       *
       * @param string $search Query produced by build_query().
       * @return array List of web result entries.
       * @throws Exception When the request fails, the key is rate limited or
       *                   the response is not the expected JSON.
       */
      private function search_api($search){

          try{

              $json =
                  $this->get(
                      $this->backend->get_ip(), // no nextpage
                      "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
                      [
                          "count" => 20
                      ]
                  );
          }catch(Exception $error){

              throw new Exception("Failed to get JSON", 0, $error);
          }

          if($json === "Slow down"){

              throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
          }

          $json = json_decode($json, true);

          // json_decode() yields null on malformed input; fail loudly instead
          // of iterating over a missing "results" key
          if(
              !is_array($json) ||
              !isset($json["results"]) ||
              !is_array($json["results"])
          ){

              throw new Exception("Failed to decode JSON");
          }

          $results = [];

          foreach($json["results"] as $result){

              $results[] = [
                  "title" => $result["title"],
                  "description" => str_replace("\n", " ", $result["description"]),
                  "url" => $result["url"],
                  "date" => null,
                  "type" => "web",
                  "thumb" => [
                      "url" => null,
                      "ratio" => null
                  ],
                  "sublink" => [],
                  "table" => []
              ];
          }

          return $results;
      }

      /**
       * Extracts the search results from an HTML result page.
       *
       * NOTE(review): relies on fuckhtml returning elements as arrays that carry
       * an "attributes" map -- confirm against lib/fuckhtml.php.
       *
       * @param string $html Result page markup.
       * @return array List of web result entries.
       */
      private function scrape_results($html){

          $results = [];

          $this->fuckhtml->load($html);

          $sections =
              $this->fuckhtml
              ->getElementsByClassName(
                  "card search-result",
                  "section"
              );

          foreach($sections as $section){

              $this->fuckhtml->load($section);

              $titles =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "title",
                      "a"
                  );

              if(count($titles) === 0){

                  // a card without a title link cannot become a result
                  continue;
              }

              $title = $titles[0];

              $description =
                  $this->fuckhtml
                  ->getElementsByClassName(
                      "description",
                      "p"
                  );

              if(count($description) !== 0){

                  $description =
                      $this->fuckhtml
                      ->getTextContent(
                          $description[0]
                      );
              }else{

                  $description = null;
              }

              $sublinks = [];

              $sublink_html =
                  $this->fuckhtml
                  ->getElementsByClassName("additional-results");

              if(count($sublink_html) !== 0){

                  // narrows the parser to the sublink container; $title was
                  // already extracted above and is unaffected
                  $this->fuckhtml->load($sublink_html[0]);

                  $links =
                      $this->fuckhtml
                      ->getElementsByTagName("a");

                  foreach($links as $link){

                      $sublinks[] = [
                          "title" =>
                              $this->fuckhtml
                              ->getTextContent(
                                  $link
                              ),
                          "date" => null,
                          "description" => null,
                          "url" =>
                              $this->fuckhtml
                              ->getTextContent(
                                  $link["attributes"]["href"]
                              )
                      ];
                  }
              }

              $results[] = [
                  "title" =>
                      $this->fuckhtml
                      ->getTextContent(
                          $title
                      ),
                  "description" => $description,
                  "url" =>
                      $this->fuckhtml
                      ->getTextContent(
                          $title["attributes"]["href"]
                      ),
                  "date" => null,
                  "type" => "web",
                  "thumb" => [
                      "url" => null,
                      "ratio" => null
                  ],
                  "sublink" => $sublinks,
                  "table" => []
              ];
          }

          return $results;
      }

      /**
       * Finds the link that follows the active pagination entry and stores its
       * query string, together with the proxy, through backend::store().
       *
       * @param string $html  Result page markup.
       * @param mixed  $proxy Proxy used for the current request.
       * @return mixed Value returned by backend::store(), or null when there
       *               is no pagination or no page after the active one.
       */
      private function scrape_next_page($html, $proxy){

          $this->fuckhtml->load($html);

          $pagination =
              $this->fuckhtml
              ->getElementsByAttributeValue(
                  "aria-label",
                  "pagination",
                  "nav"
              );

          if(count($pagination) === 0){

              // no pagination
              return null;
          }

          $this->fuckhtml->load($pagination[0]);

          $pages =
              $this->fuckhtml
              ->getElementsByClassName(
                  "page-link",
                  "a"
              );

          $found_current_page = false;

          foreach($pages as $page){

              if(
                  stripos(
                      $page["attributes"]["class"],
                      "active"
                  ) !== false
              ){

                  $found_current_page = true;
                  continue;
              }

              if($found_current_page){

                  // first link after the active entry is the next page
                  return
                      $this->backend->store(
                          parse_url(
                              $page["attributes"]["href"],
                              PHP_URL_QUERY
                          ),
                          "web",
                          $proxy
                      );
              }
          }

          return null;
      }
  }