mojeek.php 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174
  1. <?php
  2. class mojeek{
  /**
   * Load the helper libraries and create the two collaborators this
   * scraper uses: the HTML parser (fuckhtml) and the backend object,
   * which is constructed with the name "mojeek".
   *
   * include_once is used instead of include so that constructing this
   * class when the library files were already loaded earlier in the
   * request does not try to declare the same classes a second time.
   */
  public function __construct(){
      include_once "lib/fuckhtml.php";
      $this->fuckhtml = new fuckhtml();

      include_once "lib/backend.php";
      $this->backend = new backend("mojeek");
  }
  /**
   * Describe the search filters offered for a result page.
   *
   * @param mixed $page Page name; compared loosely against "web" and "news".
   * @return array|null For "web": a map of filter name => ["display" => label,
   *                    "option" => value => label]. For "news": an empty
   *                    array. For anything else: null.
   */
  public function getfilters($page){
      // Checked first, exactly like the first case of a switch would be.
      if($page == "web"){
          $focus_options = [
              "any" => "No focus",
              "blogs" => "Blogs",
              "Dictionary" => "Dictionary",
              "Recipes" => "Recipes",
              "Time" => "Time",
              "Weather" => "Weather"
          ];

          $language_options = [
              "any" => "Any language",
              "af" => "Afrikaans",
              "sq" => "Albanian",
              "an" => "Aragonese",
              "ay" => "Aymara",
              "bi" => "Bislama",
              "br" => "Breton",
              "ca" => "Catalan",
              "kw" => "Cornish",
              "co" => "Corsican",
              "hr" => "Croatian",
              "da" => "Danish",
              "nl" => "Dutch",
              "dz" => "Dzongkha",
              "en" => "English",
              "fj" => "Fijian",
              "fi" => "Finnish",
              "fr" => "French",
              "gd" => "Gaelic",
              "gl" => "Galician",
              "de" => "German",
              "ht" => "Haitian",
              "io" => "Ido",
              "id" => "Indonesian",
              "ia" => "Interlingua",
              "ie" => "Interlingue",
              "ga" => "Irish",
              "it" => "Italian",
              "rw" => "Kinyarwanda",
              "la" => "Latin",
              "li" => "Limburgish",
              "lb" => "Luxembourgish",
              "no" => "Norwegian",
              "nb" => "Norwegian Bokmål",
              "nn" => "Norwegian Nynorsk",
              "oc" => "Occitan (post 1500)",
              "pl" => "Polish",
              "pt" => "Portuguese",
              "rm" => "Romansh",
              "rn" => "Rundi",
              "sg" => "Sango",
              "so" => "Somali",
              "es" => "Spanish",
              "sw" => "Swahili",
              "ss" => "Swati",
              "sv" => "Swedish",
              "ty" => "Tahitian",
              "to" => "Tonga (Tonga Islands)",
              "ts" => "Tsonga",
              "vo" => "Volapük",
              "wa" => "Walloon",
              "cy" => "Welsh",
              "xh" => "Xhosa",
              "zu" => "Zulu"
          ];

          $country_options = [
              "any" => "No location bias",
              "af" => "Afghanistan",
              "ax" => "Åland Islands",
              "al" => "Albania",
              "dz" => "Algeria",
              "as" => "American Samoa",
              "ad" => "Andorra",
              "ao" => "Angola",
              "ai" => "Anguilla",
              "aq" => "Antarctica",
              "ag" => "Antigua and Barbuda",
              "ar" => "Argentina",
              "am" => "Armenia",
              "aw" => "Aruba",
              "au" => "Australia",
              "at" => "Austria",
              "az" => "Azerbaijan",
              "bs" => "Bahamas",
              "bh" => "Bahrain",
              "bd" => "Bangladesh",
              "bb" => "Barbados",
              "by" => "Belarus",
              "be" => "Belgium",
              "bz" => "Belize",
              "bj" => "Benin",
              "bm" => "Bermuda",
              "bt" => "Bhutan",
              "bo" => "Bolivia (Plurinational State of)",
              "bq" => "Bonaire, Sint Eustatius and Saba",
              "ba" => "Bosnia and Herzegovina",
              "bw" => "Botswana",
              "bv" => "Bouvet Island",
              "br" => "Brazil",
              "io" => "British Indian Ocean Territory",
              "bn" => "Brunei Darussalam",
              "bg" => "Bulgaria",
              "bf" => "Burkina Faso",
              "bi" => "Burundi",
              "cv" => "Cabo Verde",
              "kh" => "Cambodia",
              "cm" => "Cameroon",
              "ca" => "Canada",
              "ky" => "Cayman Islands",
              "cf" => "Central African Republic",
              "td" => "Chad",
              "cl" => "Chile",
              "cn" => "China",
              "cx" => "Christmas Island",
              "cc" => "Cocos (Keeling) Islands",
              "co" => "Colombia",
              "km" => "Comoros",
              "cg" => "Congo",
              "cd" => "Congo (Democratic Republic of the)",
              "ck" => "Cook Islands",
              "cr" => "Costa Rica",
              "ci" => "Côte d'Ivoire",
              "hr" => "Croatia",
              "cu" => "Cuba",
              "cw" => "Curaçao",
              "cy" => "Cyprus",
              "cz" => "Czechia",
              "dk" => "Denmark",
              "dj" => "Djibouti",
              "dm" => "Dominica",
              "do" => "Dominican Republic",
              "ec" => "Ecuador",
              "eg" => "Egypt",
              "sv" => "El Salvador",
              "gq" => "Equatorial Guinea",
              "er" => "Eritrea",
              "ee" => "Estonia",
              "et" => "Ethiopia",
              "fk" => "Falkland Islands (Malvinas)",
              "fo" => "Faroe Islands",
              "fj" => "Fiji",
              "fi" => "Finland",
              "fr" => "France",
              "gf" => "French Guiana",
              "pf" => "French Polynesia",
              "tf" => "French Southern Territories",
              "ga" => "Gabon",
              "gm" => "Gambia",
              "ge" => "Georgia",
              "de" => "Germany",
              "gh" => "Ghana",
              "gi" => "Gibraltar",
              "gr" => "Greece",
              "gl" => "Greenland",
              "gd" => "Grenada",
              "gp" => "Guadeloupe",
              "gu" => "Guam",
              "gt" => "Guatemala",
              "gg" => "Guernsey",
              "gn" => "Guinea",
              "gw" => "Guinea-Bissau",
              "gy" => "Guyana",
              "ht" => "Haiti",
              "hm" => "Heard Island and McDonald Islands",
              "va" => "Holy See",
              "hn" => "Honduras",
              "hk" => "Hong Kong",
              "hu" => "Hungary",
              "is" => "Iceland",
              "in" => "India",
              "id" => "Indonesia",
              "ir" => "Iran (Islamic Republic of)",
              "iq" => "Iraq",
              "ie" => "Ireland",
              "im" => "Isle of Man",
              "il" => "Israel",
              "it" => "Italy",
              "jm" => "Jamaica",
              "jp" => "Japan",
              "je" => "Jersey",
              "jo" => "Jordan",
              "kz" => "Kazakhstan",
              "ke" => "Kenya",
              "ki" => "Kiribati",
              "kp" => "Korea (Democratic People's Republic of)",
              "kr" => "Korea (Republic of)",
              "kw" => "Kuwait",
              "kg" => "Kyrgyzstan",
              "la" => "Lao People's Democratic Republic",
              "lv" => "Latvia",
              "lb" => "Lebanon",
              "ls" => "Lesotho",
              "lr" => "Liberia",
              "ly" => "Libya",
              "li" => "Liechtenstein",
              "lt" => "Lithuania",
              "lu" => "Luxembourg",
              "mo" => "Macao",
              "mk" => "Macedonia (the former Yugoslav Republic of)",
              "mg" => "Madagascar",
              "mw" => "Malawi",
              "my" => "Malaysia",
              "mv" => "Maldives",
              "ml" => "Mali",
              "mt" => "Malta",
              "mh" => "Marshall Islands",
              "mq" => "Martinique",
              "mr" => "Mauritania",
              "mu" => "Mauritius",
              "yt" => "Mayotte",
              "mx" => "Mexico",
              "fm" => "Micronesia (Federated States of)",
              "md" => "Moldova (Republic of)",
              "mc" => "Monaco",
              "mn" => "Mongolia",
              "me" => "Montenegro",
              "ms" => "Montserrat",
              "ma" => "Morocco",
              "mz" => "Mozambique",
              "mm" => "Myanmar",
              "na" => "Namibia",
              "nr" => "Nauru",
              "np" => "Nepal",
              "nl" => "Netherlands",
              "nc" => "New Caledonia",
              "nz" => "New Zealand",
              "ni" => "Nicaragua",
              "ne" => "Niger",
              "ng" => "Nigeria",
              "nu" => "Niue",
              "nf" => "Norfolk Island",
              "mp" => "Northern Mariana Islands",
              "no" => "Norway",
              "om" => "Oman",
              "pk" => "Pakistan",
              "pw" => "Palau",
              "ps" => "Palestine, State of",
              "pa" => "Panama",
              "pg" => "Papua New Guinea",
              "py" => "Paraguay",
              "pe" => "Peru",
              "ph" => "Philippines",
              "pn" => "Pitcairn",
              "pl" => "Poland",
              "pt" => "Portugal",
              "pr" => "Puerto Rico",
              "qa" => "Qatar",
              "re" => "Réunion",
              "ro" => "Romania",
              "ru" => "Russian Federation",
              "rw" => "Rwanda",
              "bl" => "Saint Barthélemy",
              "sh" => "Saint Helena, Ascension and Tristan da Cunha",
              "kn" => "Saint Kitts and Nevis",
              "lc" => "Saint Lucia",
              "mf" => "Saint Martin (French part)",
              "pm" => "Saint Pierre and Miquelon",
              "vc" => "Saint Vincent and the Grenadines",
              "ws" => "Samoa",
              "sm" => "San Marino",
              "st" => "Sao Tome and Principe",
              "sa" => "Saudi Arabia",
              "sn" => "Senegal",
              "rs" => "Serbia",
              "sc" => "Seychelles",
              "sl" => "Sierra Leone",
              "sg" => "Singapore",
              "sx" => "Sint Maarten (Dutch part)",
              "sk" => "Slovakia",
              "si" => "Slovenia",
              "sb" => "Solomon Islands",
              "so" => "Somalia",
              "za" => "South Africa",
              "gs" => "South Georgia and South Sandwich Islands",
              "ss" => "South Sudan",
              "es" => "Spain",
              "lk" => "Sri Lanka",
              "sd" => "Sudan",
              "sr" => "Suriname",
              "sj" => "Svalbard and Jan Mayen",
              "sz" => "Swaziland",
              "se" => "Sweden",
              "ch" => "Switzerland",
              "sy" => "Syrian Arab Republic",
              "tw" => "Taiwan",
              "tj" => "Tajikistan",
              "tz" => "Tanzania, United Republic of",
              "th" => "Thailand",
              "tl" => "Timor-Leste",
              "tg" => "Togo",
              "tk" => "Tokelau",
              "to" => "Tonga",
              "tt" => "Trinidad and Tobago",
              "tn" => "Tunisia",
              "tr" => "Turkey",
              "tm" => "Turkmenistan",
              "tc" => "Turks and Caicos Islands",
              "tv" => "Tuvalu",
              "ug" => "Uganda",
              "ua" => "Ukraine",
              "ae" => "United Arab Emirates",
              "gb" => "United Kingdom",
              "us" => "United States of America",
              "um" => "United States Minor Outlying Islands",
              "uy" => "Uruguay",
              "uz" => "Uzbekistan",
              "vu" => "Vanuatu",
              "ve" => "Venezuela (Bolivarian Republic of)",
              "vn" => "Viet Nam",
              "vg" => "Virgin Islands (British)",
              "vi" => "Virgin Islands (U.S.)",
              "wf" => "Wallis and Futuna",
              "eh" => "Western Sahara",
              "ye" => "Yemen",
              "zm" => "Zambia",
              "zw" => "Zimbabwe"
          ];

          $region_options = [
              "any" => "Any region",
              "eu" => "European Union",
              "de" => "Germany",
              "fr" => "France",
              "uk" => "United Kingdom"
          ];

          $per_domain_options = [
              "1" => "1 result",
              "2" => "2 results",
              "3" => "3 results",
              "4" => "4 results",
              "5" => "5 results",
              "10" => "10 results",
              "0" => "Unlimited"
          ];

          return [
              "focus" => [
                  "display" => "Focus",
                  "option" => $focus_options
              ],
              "lang" => [
                  "display" => "Language",
                  "option" => $language_options
              ],
              "country" => [
                  "display" => "Country",
                  "option" => $country_options
              ],
              "region" => [
                  "display" => "Region",
                  "option" => $region_options
              ],
              "domain" => [
                  "display" => "Results per domain",
                  "option" => $per_domain_options
              ]
          ];
      }

      // The news page exposes no filters.
      if($page == "news"){
          return [];
      }

      // Any other page name: no value.
      return null;
  }
  /**
   * Issue an HTTP GET request with curl and return the response body.
   *
   * @param mixed  $proxy Value handed to backend->assign_proxy() together
   *                      with the curl handle.
   * @param string $url   URL to fetch.
   * @param array  $get   Query parameters; when non-empty they are encoded
   *                      with http_build_query() and appended after "?".
   * @return string|bool  The value returned by curl_exec() (the body, since
   *                      CURLOPT_RETURNTRANSFER is enabled).
   * @throws Exception    Carrying curl's error message when curl reports
   *                      an error for the transfer.
   */
  private function get($proxy, $url, $get = []){
      $headers = [
          "User-Agent: " . config::USER_AGENT,
          "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
          "Accept-Language: en-US,en;q=0.5",
          "Accept-Encoding: gzip",
          "DNT: 1",
          "Connection: keep-alive",
          "Upgrade-Insecure-Requests: 1",
          "Sec-Fetch-Dest: document",
          "Sec-Fetch-Mode: navigate",
          "Sec-Fetch-Site: none",
          "Sec-Fetch-User: ?1"
      ];

      if($get !== []){
          $get = http_build_query($get);
          $url .= "?" . $get;
      }

      $curlproc = curl_init();

      // try/finally guarantees the handle is released on every exit path,
      // including the error path, which previously threw before closing it.
      try{
          curl_setopt($curlproc, CURLOPT_URL, $url);
          curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
          curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
          curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
          curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
          curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

          $this->backend->assign_proxy($curlproc, $proxy);

          $data = curl_exec($curlproc);

          if(curl_errno($curlproc)){
              throw new Exception(curl_error($curlproc));
          }

          return $data;
      }finally{
          curl_close($curlproc);
      }
  }
  /**
   * Run a web search (or fetch a stored next page) and parse the result.
   *
   * When $get["npt"] is set and truthy, the stored token and proxy are
   * read back from the backend and the token is fetched as a path on
   * www.mojeek.com. Otherwise a new search is built from $get["s"] and
   * the filter values "lang", "country", "region", "domain" and "focus";
   * a missing filter falls back to the first option of that filter.
   *
   * @param array $get Request parameters as described above.
   * @return array Keys: status, spelling, npt, answer, web, image, video,
   *               news, related. When the page contains no
   *               "results-standard" list, everything keeps its empty
   *               default.
   * @throws Exception "Search term is empty!" for an empty search string,
   *                   "Failed to get HTML" when the request fails (the
   *                   underlying exception is attached as previous).
   */
  public function web($get){
      if(!empty($get["npt"])){
          [$token, $proxy] = $this->backend->get($get["npt"], "web");

          try{
              $html = $this->get($proxy, "https://www.mojeek.com" . $token, []);
          }catch(Exception $error){
              throw new Exception("Failed to get HTML", 0, $error);
          }
      }else{
          $search = $get["s"] ?? "";
          if(strlen($search) === 0){
              throw new Exception("Search term is empty!");
          }

          $proxy = $this->backend->get_ip();

          $lang = $get["lang"] ?? "any";
          $country = $get["country"] ?? "any";
          $region = $get["region"] ?? "any";
          $domain = $get["domain"] ?? "1";
          $focus = $get["focus"] ?? "any";

          $params = [
              "q" => $search,
              "t" => 20, // number of results/page
              "tn" => 7, // number of news results/page
              "date" => 1, // show date
              "tlen" => 128, // max length of title
              "dlen" => 511, // max length of description
              "arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect!
          ];

          switch($focus){
              case "any":
                  break;

              case "blogs":
                  $params["fmt"] = "sst";
                  $params["sst"] = "1";
                  break;

              default:
                  $params["foc_t"] = $focus;
                  break;
          }

          if($lang != "any"){
              $params["lb"] = $lang;
          }

          if($region != "any"){
              $params["reg"] = $region;
          }

          if($domain != "1"){
              $params["si"] = $domain;
          }

          try{
              $html = $this->get($proxy, "https://www.mojeek.com/search", $params);
          }catch(Exception $error){
              throw new Exception("Failed to get HTML", 0, $error);
          }
      }

      $out = [
          "status" => "ok",
          "spelling" => [
              "type" => "no_correction",
              "using" => null,
              "correction" => null
          ],
          "npt" => null,
          "answer" => [],
          "web" => [],
          "image" => [],
          "video" => [],
          "news" => [],
          "related" => []
      ];

      $this->fuckhtml->load($html);
      $containers =
          $this->fuckhtml
          ->getElementsByClassName("results-standard", "ul");

      if(count($containers) === 0){
          return $out;
      }

      $out["web"] = $this->parse_web_results($containers);
      $out["answer"] = $this->parse_infoboxes($html);
      $out["news"] = $this->parse_inline_news($html);
      $out["npt"] = $this->parse_next_page($html, $proxy);

      return $out;
  }

  /**
   * Turn every <li> that holds an a.title link inside the given result
   * lists into a web result entry.
   */
  private function parse_web_results($containers){
      $web = [];

      foreach($containers as $container){
          $this->fuckhtml->load($container);
          $items = $this->fuckhtml->getElementsByTagName("li");

          foreach($items as $item){
              $this->fuckhtml->load($item);

              $titles = $this->fuckhtml->getElementsByClassName("title", "a");
              if(count($titles) === 0){
                  // no title link in this <li>: there is nothing to report
                  continue;
              }
              $title = $titles[0];

              $data = [
                  "title" =>
                      html_entity_decode(
                          $this->fuckhtml->getTextContent($title["innerHTML"])
                      ),
                  "description" => null,
                  "url" =>
                      html_entity_decode(
                          $this->fuckhtml->getTextContent($title["attributes"]["href"])
                      ),
                  "date" => null,
                  "type" => "web",
                  "thumb" => [
                      "url" => null,
                      "ratio" => null
                  ],
                  "sublink" => [],
                  "table" => []
              ];

              $description = $this->fuckhtml->getElementsByClassName("s", "p");
              if(count($description) !== 0){
                  $data["description"] =
                      $this->titledots(
                          html_entity_decode(
                              $this->fuckhtml->getTextContent($description[0])
                          )
                      );
              }

              $date = $this->fuckhtml->getElementsByClassName("mdate", "span");
              if(count($date) !== 0){
                  $data["date"] =
                      strtotime(
                          $this->fuckhtml->getTextContent($date[0])
                      );
              }

              $web[] = $data;
          }
      }

      return $web;
  }

  /**
   * Parse every div with class "infobox infobox-top" into an instant
   * answer. The box's innerHTML is split on "<hr>": part 0 is read for
   * the title and short definition, part 1 for thumbnail, table rows,
   * site links and description text, part 2 for the answer URL.
   */
  private function parse_infoboxes($html){
      $answers = [];

      $this->fuckhtml->load($html);
      $infoboxes =
          $this->fuckhtml
          ->getElementsByClassName("infobox infobox-top", "div");

      foreach($infoboxes as $infobox){
          $answer = [
              "title" => null,
              "description" => [],
              "url" => null,
              "thumb" => null,
              "table" => [],
              "sublink" => []
          ];

          $parts = explode("<hr>", $infobox["innerHTML"]);

          // first part: title + short definition
          $this->fuckhtml->load($parts[0]);

          $headings = $this->fuckhtml->getElementsByTagName("h1");
          if(count($headings) !== 0){
              $answer["title"] = $this->fuckhtml->getTextContent($headings[0]);
          }

          $definition = $this->fuckhtml->getElementsByTagName("p");
          if(count($definition) !== 0){
              $answer["description"][] = [
                  "type" => "quote",
                  "value" => $this->fuckhtml->getTextContent($definition[0])
              ];
          }

          // second part: thumbnail, if it exists. A box without a second
          // part is parsed as an empty fragment.
          $this->fuckhtml->load($parts[1] ?? "");

          $images = $this->fuckhtml->getElementsByClassName("float-right", "img");
          if(count($images) !== 0){
              preg_match(
                  '/\/image\?img=([^&]+)/i',
                  $images[0]["attributes"]["src"],
                  $match
              );

              if(count($match) === 2){
                  $answer["thumb"] =
                      urldecode(
                          $this->fuckhtml->getTextContent($match[1])
                      );
              }
          }

          // second part: paragraphs
          $ps = $this->fuckhtml->getElementsByTagName("p");

          foreach($ps as $p){
              $this->fuckhtml->load($p);

              if(preg_match('/^\s*<strong>/i', $p["innerHTML"])){
                  /*
                      Table row: "<strong>Label:</strong> value"
                  */
                  $strong = $this->fuckhtml->getElementsByTagName("strong")[0];

                  $p["innerHTML"] =
                      str_replace($strong["innerHTML"], "", $p["innerHTML"]);

                  $label =
                      preg_replace(
                          '/:$/',
                          "",
                          ucfirst($this->fuckhtml->getTextContent($strong))
                      );

                  $answer["table"][trim($label)] =
                      trim($this->fuckhtml->getTextContent($p));
                  continue;
              }

              $icons = $this->fuckhtml->getElementsByClassName("svg-icon");
              if(count($icons) !== 0){
                  /*
                      Website links: the key is everything after the
                      first space of the class attribute
                  */
                  foreach($icons as $icon){
                      $classes = explode(" ", $icon["attributes"]["class"], 2);

                      $answer["sublink"][ucfirst($classes[1] ?? "")] =
                          $this->fuckhtml
                          ->getTextContent($icon["attributes"]["href"]);
                  }
                  continue;
              }

              /*
                  Text content: walk the child tags in order, emitting the
                  text found in front of each tag and a link entry for
                  each <a>
              */
              $tags = $this->fuckhtml->getElementsByTagName("*");
              $remaining = $p["innerHTML"];
              $i = 0;

              foreach($tags as $tag){
                  $c = count($answer["description"]);

                  // cut the tag out of the text that is still unprocessed
                  $split = explode($tag["outerHTML"], $remaining, 2);

                  if(count($split) === 2){
                      if(
                          $i === 0 &&
                          $c !== 0 &&
                          $answer["description"][$c - 1]["type"] == "link"
                      ){
                          // new paragraph right after a link
                          $append = "\n\n";
                      }else{
                          $append = "";
                      }

                      if($split[0] != ""){
                          $answer["description"][] = [
                              "type" => "text",
                              "value" => $append . trim($split[0])
                          ];
                      }

                      $remaining = $split[1];
                  }else{
                      $remaining = $split[0];
                  }

                  if($tag["tagName"] == "a"){
                      $value = $this->fuckhtml->getTextContent($tag);

                      if(strtolower($value) == "wikipedia"){
                          // the "wikipedia" link is not emitted; only the
                          // entry that preceded this iteration is
                          // right-trimmed
                          if($c !== 0){
                              $answer["description"][$c - 1]["value"] =
                                  rtrim($answer["description"][$c - 1]["value"]);
                          }
                      }else{
                          $answer["description"][] = [
                              "type" => "link",
                              "url" =>
                                  $this->fuckhtml
                                  ->getTextContent($tag["attributes"]["href"]),
                              "value" => $value
                          ];
                      }
                  }

                  $i++;
              }
          }

          // third part: URL of the answer
          $this->fuckhtml->load($parts[2] ?? "");

          $links = $this->fuckhtml->getElementsByTagName("a");
          if(count($links) !== 0){
              $answer["url"] =
                  $this->fuckhtml
                  ->getTextContent($links[0]["attributes"]["href"]);
          }

          $answers[] = $answer;
      }

      return $answers;
  }

  /**
   * Parse the first div with class "results news-results" on a web
   * result page into news entries.
   */
  private function parse_inline_news($html){
      $entries = [];

      $this->fuckhtml->load($html);
      $news =
          $this->fuckhtml
          ->getElementsByClassName("results news-results", "div");

      if(count($news) === 0){
          return $entries;
      }

      $this->fuckhtml->load($news[0]);
      $lis = $this->fuckhtml->getElementsByTagName("li");

      foreach($lis as $li){
          $this->fuckhtml->load($li);

          $a = $this->fuckhtml->getElementsByClassName("ob", "a");
          if(count($a) === 0){
              continue;
          }
          $a = $a[0];

          // the date is the last " - " separated piece of the first <span>
          $date = null;
          $spans = $this->fuckhtml->getElementsByTagName("span");
          if(count($spans) !== 0){
              $pieces =
                  explode(
                      " - ",
                      $this->fuckhtml->getTextContent($spans[0])
                  );

              $date = strtotime($pieces[count($pieces) - 1]);
          }

          $entries[] = [
              "title" =>
                  html_entity_decode(
                      $this->fuckhtml->getTextContent($a)
                  ),
              "description" => null,
              "date" => $date,
              "thumb" => [
                  "url" => null,
                  "ratio" => null
              ],
              "url" =>
                  $this->fuckhtml
                  ->getTextContent($a["attributes"]["href"])
          ];
      }

      return $entries;
  }

  /**
   * Store the href of the "Next" pagination link in the backend together
   * with the proxy and return the value the backend hands back, or null
   * when the page has no such link.
   */
  private function parse_next_page($html, $proxy){
      $npt = null;

      $this->fuckhtml->load($html);
      $pagination = $this->fuckhtml->getElementsByClassName("pagination");

      // count() never returns false, so comparing against false let this
      // branch run (and read $pagination[0]) on pages without pagination
      if(count($pagination) !== 0){
          $this->fuckhtml->load($pagination[0]);
          $as = $this->fuckhtml->getElementsByTagName("a");

          foreach($as as $a){
              if($a["innerHTML"] == "Next"){
                  $npt = $this->backend->store(
                      $this->fuckhtml
                      ->getTextContent($a["attributes"]["href"]),
                      "web",
                      $proxy
                  );
              }
          }
      }

      return $npt;
  }
  /**
   * Run a news search and parse every <article> of the response.
   *
   * @param array $get Request parameters; only $get["s"] (the search
   *                   string) is read.
   * @return array Keys: status, npt (always null here), news (list of
   *               entries with title, author, description, date, thumb,
   *               url).
   * @throws Exception "Search term is empty!" for an empty search string,
   *                   "Failed to get HTML" when the request fails (the
   *                   underlying exception is attached as previous).
   */
  public function news($get){
      $search = $get["s"] ?? "";
      if(strlen($search) === 0){
          throw new Exception("Search term is empty!");
      }

      $out = [
          "status" => "ok",
          "npt" => null,
          "news" => []
      ];

      try{
          $html =
              $this->get(
                  $this->backend->get_ip(),
                  "https://www.mojeek.com/search",
                  [
                      "q" => $search,
                      "fmt" => "news"
                  ]
              );
      }catch(Exception $error){
          throw new Exception("Failed to get HTML", 0, $error);
      }

      $this->fuckhtml->load($html);
      $articles = $this->fuckhtml->getElementsByTagName("article");

      foreach($articles as $article){
          $this->fuckhtml->load($article);

          $data = [
              "title" => null,
              "author" => null,
              "description" => null,
              "date" => null,
              "thumb" => [
                  "url" => null,
                  "ratio" => null
              ],
              "url" => null
          ];

          $anchors = $this->fuckhtml->getElementsByTagName("a");
          if(count($anchors) === 0){
              // an article without any link has neither title nor URL
              continue;
          }
          $a = $anchors[0];

          $data["title"] =
              $this->fuckhtml
              ->getTextContent($a["attributes"]["title"]);

          $data["url"] =
              $this->fuckhtml
              ->getTextContent($a["attributes"]["href"]);

          $p = $this->fuckhtml->getElementsByTagName("p");

          // description: first paragraph with class "s"; stays null when
          // it is missing or empty
          $snippets = $this->fuckhtml->getElementsByClassName("s", $p);
          if(count($snippets) !== 0){
              $description =
                  $this->titledots(
                      $this->fuckhtml->getTextContent($snippets[0])
                  );

              if($description != ""){
                  $data["description"] = $description;
              }
          }

          // get date from big node
          $date = $this->fuckhtml->getElementsByClassName("date", $p);
          if(count($date) !== 0){
              $data["date"] =
                  strtotime(
                      $this->fuckhtml->getTextContent($date[0])
                  );
          }

          // date + author live in the paragraph with class "i"
          $infos = $this->fuckhtml->getElementsByClassName("i", $p);
          if(count($infos) === 0){
              $out["news"][] = $data;
              continue;
          }

          $s = $infos[0];
          $this->fuckhtml->load($s);

          $a = $this->fuckhtml->getElementsByTagName("a");

          if(count($a) !== 0){
              // big node: the author is the text of the first link
              $data["author"] =
                  htmlspecialchars_decode(
                      $this->fuckhtml->getTextContent($a[0]["innerHTML"])
                  );
          }else{
              // smaller node: take the date from <time>, then the author
              // is what is left once the <time> tag and a trailing
              // " &bull; " are removed
              $times = $this->fuckhtml->getElementsByTagName("time");

              if(count($times) !== 0){
                  $data["date"] =
                      strtotime(
                          $this->fuckhtml->getTextContent($times[0])
                      );

                  $s["innerHTML"] =
                      str_replace(
                          $times[0]["outerHTML"],
                          "",
                          $s["innerHTML"]
                      );
              }

              $data["author"] =
                  preg_replace(
                      '/ &bull; $/',
                      "",
                      $s["innerHTML"]
                  );
          }

          $out["news"][] = $data;
      }

      return $out;
  }
  /**
   * Strip dots and whitespace from both ends of a string.
   *
   * @param string $title Text to clean up.
   * @return string $title without leading/trailing ".", space, tab,
   *                newline, carriage return, NUL and vertical tab.
   */
  private function titledots($title){
      // "." followed by the characters trim() strips by default
      $strip = ". \t\n\r\0\x0B";

      return ltrim(rtrim($title, $strip), $strip);
  }
  1002. }