123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476 |
- <?php
/**
 * Scraper for the Marginalia search engine.
 *
 * Two code paths are supported:
 *  - when config::MARGINALIA_API_KEY is set, results come from the JSON API
 *    at api.marginalia.nu (no pagination);
 *  - otherwise the HTML frontend at search.marginalia.nu is scraped, which
 *    also supports the "adtech", "recent" and "intitle" filters and
 *    next-page tokens.
 */
class marginalia{
	
	/**
	 * HTML parser instance created from lib/fuckhtml.php.
	 * Declared explicitly because dynamic properties are deprecated
	 * since PHP 8.2; kept public so visibility does not change.
	 */
	public $fuckhtml;
	
	/**
	 * Backend helper created from lib/backend.php; used here for proxy
	 * selection (get_ip/assign_proxy) and next-page token storage
	 * (store/get).
	 */
	public $backend;
	
	/**
	 * Loads the helper libraries and instantiates them.
	 */
	public function __construct(){
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		include "lib/backend.php";
		$this->backend = new backend("marginalia");
	}
	
	/**
	 * Returns the filter definitions offered for this scraper.
	 *
	 * The "adtech", "recent" and "intitle" toggles are only offered when
	 * no API key is configured, since only the HTML frontend path reads
	 * them.
	 *
	 * @param mixed $page Unused by this scraper.
	 * @return array Map of filter name => ["display" => ..., "option" => [...]]
	 */
	public function getfilters($page){
		
		$frontend_only = [];
		
		if(config::MARGINALIA_API_KEY === null){
			
			$frontend_only = [
				"adtech" => [
					"display" => "Reduce adtech",
					"option" => [
						"no" => "No",
						"yes" => "Yes"
					]
				],
				"recent" => [
					"display" => "Recent results",
					"option" => [
						"no" => "No",
						"yes" => "Yes"
					]
				],
				"intitle" => [
					"display" => "Search in title",
					"option" => [
						"no" => "No",
						"yes" => "Yes"
					]
				]
			];
		}
		
		$common = [
			"format" => [
				"display" => "Format",
				"option" => [
					"any" => "Any format",
					"html5" => "html5",
					"xhtml" => "xhtml",
					"html123" => "html123"
				]
			],
			"file" => [
				"display" => "Filetype",
				"option" => [
					"any" => "Any filetype",
					"nomedia" => "Deny media",
					"media" => "Contains media",
					"audio" => "Contains audio",
					"video" => "Contains video",
					"archive" => "Contains archive",
					"document" => "Contains document"
				]
			],
			"javascript" => [
				"display" => "Javascript",
				"option" => [
					"any" => "Allow JS",
					"deny" => "Deny JS",
					"require" => "Require JS"
				]
			],
			"trackers" => [
				"display" => "Trackers",
				"option" => [
					"any" => "Allow trackers",
					"deny" => "Deny trackers",
					"require" => "Require trackers"
				]
			],
			"cookies" => [
				"display" => "Cookies",
				"option" => [
					"any" => "Allow cookies",
					"deny" => "Deny cookies",
					"require" => "Require cookies"
				]
			],
			"affiliate" => [
				"display" => "Affiliate links in body",
				"option" => [
					"any" => "Allow affiliate links",
					"deny" => "Deny affiliate links",
					"require" => "Require affiliate links"
				]
			]
		];
		
		return array_merge($frontend_only, $common);
	}
	
	/**
	 * Performs an HTTP GET request through cURL and returns the body.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend->assign_proxy().
	 * @param string $url   Target URL.
	 * @param array  $get   Optional query parameters; appended to $url
	 *                      after a "?" when not empty.
	 * @return string|bool  Value returned by curl_exec().
	 * @throws Exception    With the cURL error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		];
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		$curlproc = curl_init();
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message before closing so the handle is
			// released on the error path as well
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Runs a web search.
	 *
	 * @param array $get Request parameters. Reads "s" (search term),
	 *                   "format", "file", "javascript", "trackers",
	 *                   "cookies", "affiliate", and on the HTML path
	 *                   also "npt", "adtech", "recent" and "intitle".
	 * @return array     Result structure with "status", "spelling", "npt",
	 *                   "answer", "web", "image", "video", "news" and
	 *                   "related" keys; only "web" and "npt" are filled.
	 * @throws Exception When the search term is empty, the request fails,
	 *                   the API reports rate limiting or its response
	 *                   cannot be decoded.
	 */
	public function web($get){
		
		if(strlen($get["s"]) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$search = $this->build_search($get);
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		if(config::MARGINALIA_API_KEY !== null){
			
			// API scraper
			return $this->web_api($search, $out);
		}
		
		// HTML parser
		return $this->web_html($get, $search, $out);
	}
	
	/**
	 * Builds the query string: the search term followed by the keywords
	 * derived from the tri-state, format and filetype filters.
	 *
	 * @param array $get Request parameters (see web()).
	 * @return string    Space separated query.
	 */
	private function build_search($get){
		
		$search = [$get["s"]];
		
		// keyword emitted for each any/deny/require filter
		$keywords = [
			"javascript" => "js:true",
			"trackers" => "special:tracking",
			"cookies" => "special:cookies",
			"affiliate" => "special:affiliate"
		];
		
		foreach($keywords as $filter => $keyword){
			
			$value = $get[$filter];
			
			if($value === "any"){ continue; }
			
			// "deny" negates the keyword, anything else requires it
			$search[] = $value === "deny" ? "-" . $keyword : $keyword;
		}
		
		$format = $get["format"];
		
		if($format !== "any"){
			
			$search[] = "format:$format";
		}
		
		$file = $get["file"];
		
		if($file === "nomedia"){
			
			$search[] = "-special:media";
		}elseif($file === "media"){
			
			$search[] = "special:media";
		}elseif($file !== "any"){
			
			$search[] = "file:$file";
		}
		
		return implode(" ", $search);
	}
	
	/**
	 * Builds the query parameters for a first-page HTML frontend request.
	 *
	 * @param array  $get    Request parameters; reads "adtech", "recent"
	 *                       and "intitle".
	 * @param string $search Query produced by build_search().
	 * @return array         Parameters for search.marginalia.nu/search.
	 */
	private function frontend_params($get, $search){
		
		$params = [
			"query" => $search
		];
		
		// filter name => [frontend parameter, value sent when enabled]
		$toggles = [
			"adtech" => ["adtech", "reduce"],
			"recent" => ["recent", "recent"],
			"intitle" => ["searchTitle", "title"]
		];
		
		foreach($toggles as $filter => [$name, $value]){
			
			if($get[$filter] === "yes"){
				
				$params[$name] = $value;
			}
		}
		
		return $params;
	}
	
	/**
	 * Fetches results from the JSON API and appends them to $out["web"].
	 *
	 * @param string $search Query produced by build_search().
	 * @param array  $out    Result skeleton prepared by web().
	 * @return array         $out with "web" filled in.
	 * @throws Exception     On request failure, rate limiting or an
	 *                       undecodable response.
	 */
	private function web_api($search, $out){
		
		try{
			$json =
				$this->get(
					$this->backend->get_ip(), // no nextpage
					"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
					[
						"count" => 20
					]
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get JSON", 0, $error);
		}
		
		if($json === "Slow down"){
			
			throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
		}
		
		$json = json_decode($json, true);
		
		if(!is_array($json)){
			
			throw new Exception("Failed to decode JSON");
		}
		
		// a response without a "results" list yields no results
		$results =
			isset($json["results"]) && is_array($json["results"]) ?
			$json["results"] : [];
		
		foreach($results as $result){
			
			$out["web"][] = [
				"title" => $result["title"],
				"description" => str_replace("\n", " ", $result["description"]),
				"url" => $result["url"],
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Fetches a result page from the HTML frontend, parses the results
	 * into $out["web"] and sets $out["npt"] when a next page exists.
	 *
	 * @param array  $get    Request parameters (see web()).
	 * @param string $search Query produced by build_search().
	 * @param array  $out    Result skeleton prepared by web().
	 * @return array         $out with "web" and "npt" filled in.
	 * @throws Exception     When the page cannot be fetched.
	 */
	private function web_html($get, $search, $out){
		
		$proxy = $this->backend->get_ip();
		
		if($get["npt"]){
			
			// next page: backend->get() hands back the stored query
			// string and the proxy that was saved with it
			[$query, $proxy] =
				$this->backend->get(
					$get["npt"],
					"web"
				);
			
			$url = "https://search.marginalia.nu/search?" . $query;
			$params = [];
		}else{
			
			$url = "https://search.marginalia.nu/search";
			$params = $this->frontend_params($get, $search);
		}
		
		try{
			$html =
				$this->get(
					$proxy,
					$url,
					$params
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML", 0, $error);
		}
		
		$this->fuckhtml->load($html);
		
		$sections =
			$this->fuckhtml
			->getElementsByClassName(
				"card search-result",
				"section"
			);
		
		foreach($sections as $section){
			
			$result = $this->parse_section($section);
			
			if($result !== null){
				
				$out["web"][] = $result;
			}
		}
		
		// get next page
		$out["npt"] = $this->next_page_token($html, $proxy);
		
		return $out;
	}
	
	/**
	 * Converts one search-result <section> into a web result entry.
	 *
	 * Note: this reloads the shared fuckhtml instance with $section (and
	 * possibly with its "additional-results" element), so callers must
	 * load their own document again afterwards.
	 *
	 * @param mixed $section Element returned by getElementsByClassName().
	 * @return array|null    Result entry, or null when the section has
	 *                       no title link.
	 */
	private function parse_section($section){
		
		$this->fuckhtml->load($section);
		
		$titles =
			$this->fuckhtml
			->getElementsByClassName(
				"title",
				"a"
			);
		
		if(
			count($titles) === 0 ||
			!isset($titles[0]["attributes"]["href"])
		){
			
			// nothing to link to, skip this section
			return null;
		}
		
		$title = $titles[0];
		
		$descriptions =
			$this->fuckhtml
			->getElementsByClassName(
				"description",
				"p"
			);
		
		$description =
			count($descriptions) !== 0 ?
			$this->fuckhtml->getTextContent($descriptions[0]) :
			null;
		
		$sublinks = [];
		$additional =
			$this->fuckhtml
			->getElementsByClassName("additional-results");
		
		if(count($additional) !== 0){
			
			$this->fuckhtml->load($additional[0]);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			foreach($links as $link){
				
				if(!isset($link["attributes"]["href"])){
					
					// anchor without a target
					continue;
				}
				
				$sublinks[] = [
					"title" => $this->fuckhtml->getTextContent($link),
					"date" => null,
					"description" => null,
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$link["attributes"]["href"]
						)
				];
			}
		}
		
		return [
			"title" => $this->fuckhtml->getTextContent($title),
			"description" => $description,
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$title["attributes"]["href"]
				),
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => $sublinks,
			"table" => []
		];
	}
	
	/**
	 * Looks for the pagination link that follows the active page and
	 * stores its query string through backend->store().
	 *
	 * @param string $html  Full result page.
	 * @param mixed  $proxy Proxy used for this request, stored alongside
	 *                      the query string.
	 * @return mixed        Value returned by backend->store(), or null
	 *                      when there is no next page.
	 */
	private function next_page_token($html, $proxy){
		
		$this->fuckhtml->load($html);
		
		$pagination =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"aria-label",
				"pagination",
				"nav"
			);
		
		if(count($pagination) === 0){
			
			// no pagination
			return null;
		}
		
		$this->fuckhtml->load($pagination[0]);
		
		$pages =
			$this->fuckhtml
			->getElementsByClassName(
				"page-link",
				"a"
			);
		
		$found_current_page = false;
		
		foreach($pages as $page){
			
			if(
				stripos(
					$page["attributes"]["class"],
					"active"
				) !== false
			){
				
				$found_current_page = true;
				continue;
			}
			
			if($found_current_page){
				
				// first link after the active one is the next page
				return
					$this->backend->store(
						parse_url(
							$page["attributes"]["href"],
							PHP_URL_QUERY
						),
						"web",
						$proxy
					);
			}
		}
		
		return null;
	}
}
-
|