google.php 90 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049
  1. <?php
  2. // @TODO check for consent.google.com page, if need be
  3. class google{
  /**
   * Load the helper libraries and keep one instance of each on this object.
   *
   * Includes lib/fuckhtml.php and lib/backend.php, then stores a fuckhtml
   * instance in $this->fuckhtml and a backend instance (constructed with the
   * string "google") in $this->backend.
   */
  public function __construct(){

      // include_once instead of include: both files are expected to declare a
      // class, and including them a second time (a second instance of this
      // class, or the library already loaded elsewhere) would be a fatal
      // "cannot redeclare class" error.
      include_once "lib/fuckhtml.php";
      $this->fuckhtml = new fuckhtml();

      include_once "lib/backend.php";
      $this->backend = new backend("google");
  }
  /**
   * Describe the search filters offered for a given result page.
   *
   * Every filter is an array with a "display" label and an "option" entry.
   * "option" is either a map of value => human readable label, or the string
   * "_DATE" (presumably a date-input placeholder resolved by the caller --
   * confirm against the frontend). The trailing comments name the Google
   * query parameter each filter corresponds to.
   *
   * The "country" and "nsfw" filters are common to all pages; page specific
   * filters are appended after them, in the order listed below.
   *
   * @param string $page One of "web", "images", "videos" or "news".
   * @return array|null Filter definitions keyed by filter name, or null when
   *                    $page is not one of the supported pages.
   */
  public function getfilters($page){

      // Filters shared by every page type.
      $base = [
          "country" => [ // gl=<country> (image: cr=countryAF)
              "display" => "Country",
              "option" => [
                  "any" => "Instance's country",
                  "af" => "Afghanistan",
                  "al" => "Albania",
                  "dz" => "Algeria",
                  "as" => "American Samoa",
                  "ad" => "Andorra",
                  "ao" => "Angola",
                  "ai" => "Anguilla",
                  "aq" => "Antarctica",
                  "ag" => "Antigua and Barbuda",
                  "ar" => "Argentina",
                  "am" => "Armenia",
                  "aw" => "Aruba",
                  "au" => "Australia",
                  "at" => "Austria",
                  "az" => "Azerbaijan",
                  "bs" => "Bahamas",
                  "bh" => "Bahrain",
                  "bd" => "Bangladesh",
                  "bb" => "Barbados",
                  "by" => "Belarus",
                  "be" => "Belgium",
                  "bz" => "Belize",
                  "bj" => "Benin",
                  "bm" => "Bermuda",
                  "bt" => "Bhutan",
                  "bo" => "Bolivia",
                  "ba" => "Bosnia and Herzegovina",
                  "bw" => "Botswana",
                  "bv" => "Bouvet Island",
                  "br" => "Brazil",
                  "io" => "British Indian Ocean Territory",
                  "bn" => "Brunei Darussalam",
                  "bg" => "Bulgaria",
                  "bf" => "Burkina Faso",
                  "bi" => "Burundi",
                  "kh" => "Cambodia",
                  "cm" => "Cameroon",
                  "ca" => "Canada",
                  "cv" => "Cape Verde",
                  "ky" => "Cayman Islands",
                  "cf" => "Central African Republic",
                  "td" => "Chad",
                  "cl" => "Chile",
                  "cn" => "China",
                  "cx" => "Christmas Island",
                  "cc" => "Cocos (Keeling) Islands",
                  "co" => "Colombia",
                  "km" => "Comoros",
                  "cg" => "Congo",
                  "cd" => "Congo, the Democratic Republic",
                  "ck" => "Cook Islands",
                  "cr" => "Costa Rica",
                  "ci" => "Cote D'ivoire",
                  "hr" => "Croatia",
                  "cu" => "Cuba",
                  "cy" => "Cyprus",
                  "cz" => "Czech Republic",
                  "dk" => "Denmark",
                  "dj" => "Djibouti",
                  "dm" => "Dominica",
                  "do" => "Dominican Republic",
                  "ec" => "Ecuador",
                  "eg" => "Egypt",
                  "sv" => "El Salvador",
                  "gq" => "Equatorial Guinea",
                  "er" => "Eritrea",
                  "ee" => "Estonia",
                  "et" => "Ethiopia",
                  "fk" => "Falkland Islands (Malvinas)",
                  "fo" => "Faroe Islands",
                  "fj" => "Fiji",
                  "fi" => "Finland",
                  "fr" => "France",
                  "gf" => "French Guiana",
                  "pf" => "French Polynesia",
                  "tf" => "French Southern Territories",
                  "ga" => "Gabon",
                  "gm" => "Gambia",
                  "ge" => "Georgia",
                  "de" => "Germany",
                  "gh" => "Ghana",
                  "gi" => "Gibraltar",
                  "gr" => "Greece",
                  "gl" => "Greenland",
                  "gd" => "Grenada",
                  "gp" => "Guadeloupe",
                  "gu" => "Guam",
                  "gt" => "Guatemala",
                  "gn" => "Guinea",
                  "gw" => "Guinea-Bissau",
                  "gy" => "Guyana",
                  "ht" => "Haiti",
                  "hm" => "Heard Island and Mcdonald Islands",
                  "va" => "Holy See (Vatican City State)",
                  "hn" => "Honduras",
                  "hk" => "Hong Kong",
                  "hu" => "Hungary",
                  "is" => "Iceland",
                  "in" => "India",
                  "id" => "Indonesia",
                  "ir" => "Iran, Islamic Republic",
                  "iq" => "Iraq",
                  "ie" => "Ireland",
                  "il" => "Israel",
                  "it" => "Italy",
                  "jm" => "Jamaica",
                  "jp" => "Japan",
                  "jo" => "Jordan",
                  "kz" => "Kazakhstan",
                  "ke" => "Kenya",
                  "ki" => "Kiribati",
                  "kp" => "Korea, Democratic People's Republic",
                  "kr" => "Korea, Republic",
                  "kw" => "Kuwait",
                  "kg" => "Kyrgyzstan",
                  "la" => "Lao People's Democratic Republic",
                  "lv" => "Latvia",
                  "lb" => "Lebanon",
                  "ls" => "Lesotho",
                  "lr" => "Liberia",
                  "ly" => "Libyan Arab Jamahiriya",
                  "li" => "Liechtenstein",
                  "lt" => "Lithuania",
                  "lu" => "Luxembourg",
                  "mo" => "Macao",
                  "mk" => "Macedonia, the Former Yugoslav Republic",
                  "mg" => "Madagascar",
                  "mw" => "Malawi",
                  "my" => "Malaysia",
                  "mv" => "Maldives",
                  "ml" => "Mali",
                  "mt" => "Malta",
                  "mh" => "Marshall Islands",
                  "mq" => "Martinique",
                  "mr" => "Mauritania",
                  "mu" => "Mauritius",
                  "yt" => "Mayotte",
                  "mx" => "Mexico",
                  "fm" => "Micronesia, Federated States",
                  "md" => "Moldova, Republic",
                  "mc" => "Monaco",
                  "mn" => "Mongolia",
                  "ms" => "Montserrat",
                  "ma" => "Morocco",
                  "mz" => "Mozambique",
                  "mm" => "Myanmar",
                  "na" => "Namibia",
                  "nr" => "Nauru",
                  "np" => "Nepal",
                  "nl" => "Netherlands",
                  "an" => "Netherlands Antilles",
                  "nc" => "New Caledonia",
                  "nz" => "New Zealand",
                  "ni" => "Nicaragua",
                  "ne" => "Niger",
                  "ng" => "Nigeria",
                  "nu" => "Niue",
                  "nf" => "Norfolk Island",
                  "mp" => "Northern Mariana Islands",
                  "no" => "Norway",
                  "om" => "Oman",
                  "pk" => "Pakistan",
                  "pw" => "Palau",
                  "ps" => "Palestinian Territory, Occupied",
                  "pa" => "Panama",
                  "pg" => "Papua New Guinea",
                  "py" => "Paraguay",
                  "pe" => "Peru",
                  "ph" => "Philippines",
                  "pn" => "Pitcairn",
                  "pl" => "Poland",
                  "pt" => "Portugal",
                  "pr" => "Puerto Rico",
                  "qa" => "Qatar",
                  "re" => "Reunion",
                  "ro" => "Romania",
                  "ru" => "Russian Federation",
                  "rw" => "Rwanda",
                  "sh" => "Saint Helena",
                  "kn" => "Saint Kitts and Nevis",
                  "lc" => "Saint Lucia",
                  "pm" => "Saint Pierre and Miquelon",
                  "vc" => "Saint Vincent and the Grenadines",
                  "ws" => "Samoa",
                  "sm" => "San Marino",
                  "st" => "Sao Tome and Principe",
                  "sa" => "Saudi Arabia",
                  "sn" => "Senegal",
                  "cs" => "Serbia and Montenegro",
                  "sc" => "Seychelles",
                  "sl" => "Sierra Leone",
                  "sg" => "Singapore",
                  "sk" => "Slovakia",
                  "si" => "Slovenia",
                  "sb" => "Solomon Islands",
                  "so" => "Somalia",
                  "za" => "South Africa",
                  "gs" => "South Georgia and the South Sandwich Islands",
                  "es" => "Spain",
                  "lk" => "Sri Lanka",
                  "sd" => "Sudan",
                  "sr" => "Suriname",
                  "sj" => "Svalbard and Jan Mayen",
                  "sz" => "Swaziland",
                  "se" => "Sweden",
                  "ch" => "Switzerland",
                  "sy" => "Syrian Arab Republic",
                  "tw" => "Taiwan, Province of China",
                  "tj" => "Tajikistan",
                  "tz" => "Tanzania, United Republic",
                  "th" => "Thailand",
                  "tl" => "Timor-Leste",
                  "tg" => "Togo",
                  "tk" => "Tokelau",
                  "to" => "Tonga",
                  "tt" => "Trinidad and Tobago",
                  "tn" => "Tunisia",
                  "tr" => "Turkey",
                  "tm" => "Turkmenistan",
                  "tc" => "Turks and Caicos Islands",
                  "tv" => "Tuvalu",
                  "ug" => "Uganda",
                  "ua" => "Ukraine",
                  "ae" => "United Arab Emirates",
                  "uk" => "United Kingdom",
                  "us" => "United States",
                  "um" => "United States Minor Outlying Islands",
                  "uy" => "Uruguay",
                  "uz" => "Uzbekistan",
                  "vu" => "Vanuatu",
                  "ve" => "Venezuela",
                  "vn" => "Viet Nam",
                  "vg" => "Virgin Islands, British",
                  "vi" => "Virgin Islands, U.S.",
                  "wf" => "Wallis and Futuna",
                  "eh" => "Western Sahara",
                  "ye" => "Yemen",
                  "zm" => "Zambia",
                  "zw" => "Zimbabwe"
              ]
          ],
          "nsfw" => [
              "display" => "NSFW",
              "option" => [
                  "yes" => "Yes", // safe=active
                  "no" => "No" // safe=off
              ]
          ]
      ];

      // The date range filters are identical on the web, videos and news
      // pages, so they are defined once here instead of three times.
      $newer = [ // tbs
          "display" => "Newer than",
          "option" => "_DATE"
      ];

      $older = [
          "display" => "Older than",
          "option" => "_DATE"
      ];

      switch($page){

          case "web":
              return array_merge(
                  $base,
                  [
                      "lang" => [ // lr=<lang> (prefix lang with "lang_")
                          "display" => "Language",
                          "option" => [
                              "any" => "Any language",
                              "ar" => "Arabic",
                              "bg" => "Bulgarian",
                              "ca" => "Catalan",
                              "cs" => "Czech",
                              "da" => "Danish",
                              "de" => "German",
                              "el" => "Greek",
                              "en" => "English",
                              "es" => "Spanish",
                              "et" => "Estonian",
                              "fi" => "Finnish",
                              "fr" => "French",
                              "hr" => "Croatian",
                              "hu" => "Hungarian",
                              "id" => "Indonesian",
                              "is" => "Icelandic",
                              "it" => "Italian",
                              "iw" => "Hebrew",
                              "ja" => "Japanese",
                              "ko" => "Korean",
                              "lt" => "Lithuanian",
                              "lv" => "Latvian",
                              "nl" => "Dutch",
                              "no" => "Norwegian",
                              "pl" => "Polish",
                              "pt" => "Portuguese",
                              "ro" => "Romanian",
                              "ru" => "Russian",
                              "sk" => "Slovak",
                              "sl" => "Slovenian",
                              "sr" => "Serbian",
                              "sv" => "Swedish",
                              "tr" => "Turkish",
                              "zh-CN" => "Chinese (Simplified)",
                              "zh-TW" => "Chinese (Traditional)"
                          ]
                      ],
                      "newer" => $newer,
                      "older" => $older,
                      "spellcheck" => [
                          "display" => "Spellcheck",
                          "option" => [
                              "yes" => "Yes",
                              "no" => "No"
                          ]
                      ]
                  ]
              );

          case "images":
              return array_merge(
                  $base,
                  [
                      "time" => [ // tbs=qdr:<time>
                          "display" => "Time posted",
                          "option" => [
                              "any" => "Any time",
                              "d" => "Past 24 hours",
                              "w" => "Past week",
                              "m" => "Past month",
                              "y" => "Past year"
                          ]
                      ],
                      "size" => [ // imgsz
                          "display" => "Size",
                          "option" => [
                              "any" => "Any size",
                              "l" => "Large",
                              "m" => "Medium",
                              "i" => "Icon",
                              "qsvga" => "Larger than 400x300",
                              "vga" => "Larger than 640x480",
                              "svga" => "Larger than 800x600",
                              "xga" => "Larger than 1024x768",
                              "2mp" => "Larger than 2MP",
                              "4mp" => "Larger than 4MP",
                              "6mp" => "Larger than 6MP",
                              "8mp" => "Larger than 8MP",
                              "10mp" => "Larger than 10MP",
                              "12mp" => "Larger than 12MP",
                              "15mp" => "Larger than 15MP",
                              "20mp" => "Larger than 20MP",
                              "40mp" => "Larger than 40MP",
                              "70mp" => "Larger than 70MP"
                          ]
                      ],
                      "ratio" => [ // imgar
                          "display" => "Aspect ratio",
                          "option" => [
                              "any" => "Any ratio",
                              "t|xt" => "Tall",
                              "s" => "Square",
                              "w" => "Wide",
                              "xw" => "Panoramic"
                          ]
                      ],
                      "color" => [ // imgc
                          "display" => "Color",
                          "option" => [
                              "any" => "Any color",
                              "color" => "Full color",
                              "bnw" => "Black & white",
                              "trans" => "Transparent",
                              // from here, imgcolor
                              "red" => "Red",
                              "orange" => "Orange",
                              "yellow" => "Yellow",
                              "green" => "Green",
                              "teal" => "Teal",
                              "blue" => "Blue",
                              "purple" => "Purple",
                              "pink" => "Pink",
                              "white" => "White",
                              "gray" => "Gray",
                              "black" => "Black",
                              "brown" => "Brown"
                          ]
                      ],
                      "type" => [ // tbs=itp:<type>
                          "display" => "Type",
                          "option" => [
                              "any" => "Any type",
                              "clipart" => "Clip Art",
                              "lineart" => "Line Drawing",
                              "animated" => "Animated"
                          ]
                      ],
                      "format" => [ // as_filetype
                          "display" => "Format",
                          "option" => [
                              "any" => "Any format",
                              "jpg" => "JPG",
                              "gif" => "GIF",
                              "png" => "PNG",
                              "bmp" => "BMP",
                              "svg" => "SVG",
                              "webp" => "WEBP",
                              "ico" => "ICO",
                              "craw" => "RAW"
                          ]
                      ],
                      "rights" => [ // tbs=sur:<rights>
                          "display" => "Usage rights",
                          "option" => [
                              "any" => "Any license",
                              "cl" => "Creative Commons licenses",
                              "ol" => "Commercial & other licenses"
                          ]
                      ]
                  ]
              );

          case "videos":
              return array_merge(
                  $base,
                  [
                      "newer" => $newer,
                      "older" => $older,
                      "duration" => [
                          "display" => "Duration",
                          "option" => [
                              "any" => "Any duration",
                              "s" => "Short (0-4min)", // tbs=dur:s
                              "m" => "Medium (4-20min)", // tbs=dur:m
                              "l" => "Long (20+ min)" // tbs=dur:l
                          ]
                      ],
                      "quality" => [
                          "display" => "Quality",
                          "option" => [
                              "any" => "Any quality",
                              "h" => "High quality" // tbs=hq:h
                          ]
                      ],
                      "captions" => [
                          "display" => "Captions",
                          "option" => [
                              "any" => "No preference",
                              "yes" => "Closed captioned" // tbs=cc:1
                          ]
                      ]
                  ]
              );

          case "news":
              return array_merge(
                  $base,
                  [
                      "newer" => $newer,
                      "older" => $older,
                      "sort" => [
                          "display" => "Sort",
                          "option" => [
                              "relevance" => "Relevance",
                              "date" => "Date" // sbd:1
                          ]
                      ]
                  ]
              );

          default:
              // Unsupported page. The function used to fall off the end of
              // the switch here, which also yields null; it is now explicit.
              return null;
      }
  }
  /**
   * Perform an HTTP GET request with curl and return the response body.
   *
   * @param mixed  $proxy Passed unchanged to $this->backend->assign_proxy().
   * @param string $url   Target URL.
   * @param array  $get   Query parameters. When non-empty they are encoded
   *                      with http_build_query() and appended to $url.
   * @return string Response body as returned by curl_exec()
   *                (CURLOPT_RETURNTRANSFER is enabled).
   * @throws Exception When curl reports an error for the transfer; the
   *                   message is curl_error()'s text.
   */
  private function get($proxy, $url, $get = []){

      $headers = [
          "User-Agent: " . config::USER_AGENT,
          "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
          "Accept-Language: en-US,en;q=0.5",
          "Accept-Encoding: gzip",
          "DNT: 1",
          //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
          "Connection: keep-alive",
          "Upgrade-Insecure-Requests: 1",
          "Sec-Fetch-Dest: document",
          "Sec-Fetch-Mode: navigate",
          "Sec-Fetch-Site: none",
          "Sec-Fetch-User: ?1",
          "Priority: u=1",
          "TE: trailers"
      ];

      if($get !== []){

          // use "&" when $url already carries a query string, otherwise the
          // result would contain two "?" and the parameters would be lost
          $separator = strpos($url, "?") === false ? "?" : "&";
          $url .= $separator . http_build_query($get);
      }

      $curlproc = curl_init();

      try{

          curl_setopt($curlproc, CURLOPT_URL, $url);

          // empty string: let curl decode every encoding it supports
          curl_setopt($curlproc, CURLOPT_ENCODING, "");
          curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

          // use http2
          curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

          curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
          curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
          curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
          curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

          // follow redirects
          curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true);

          $this->backend->assign_proxy($curlproc, $proxy);

          $data = curl_exec($curlproc);

          if(curl_errno($curlproc)){

              // the message is read before the finally block closes the handle
              throw new Exception(curl_error($curlproc));
          }

          return $data;

      }finally{

          // release the handle on success and on every failure path; the
          // previous code skipped this whenever an exception was thrown
          curl_close($curlproc);
      }
  }
  536. private function parsepage($html, $pagetype, $search, $proxy, $params){
  537. $out = [
  538. "status" => "ok",
  539. "spelling" => [
  540. "type" => "no_correction",
  541. "using" => null,
  542. "correction" => null
  543. ],
  544. "npt" => null,
  545. "answer" => [],
  546. "web" => [],
  547. "image" => [],
  548. "video" => [],
  549. "news" => [],
  550. "related" => []
  551. ];
  552. $this->fuckhtml->load($html);
  553. $this->detect_sorry();
  554. // parse all <style> tags
  555. $this->parsestyles();
  556. // get javascript images
  557. $this->scrape_dimg($html);
  558. // get html blobs
  559. preg_match_all(
  560. '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/',
  561. $html,
  562. $blobs
  563. );
  564. $this->blobs = [];
  565. if(isset($blobs[1])){
  566. for($i=0; $i<count($blobs[1]); $i++){
  567. $this->blobs[$blobs[1][$i]] =
  568. $this->fuckhtml
  569. ->parseJsString(
  570. $blobs[2][$i]
  571. );
  572. }
  573. }
  574. $this->scrape_imagearr($html);
  575. //
  576. // load result column
  577. //
  578. $result_div =
  579. $this->fuckhtml
  580. ->getElementById(
  581. "center_col",
  582. "div"
  583. );
  584. if($result_div === false){
  585. throw new Exception("Failed to grep result div");
  586. }
  587. $this->fuckhtml->load($result_div);
  588. //
  589. // Get word corrections
  590. //
  591. $correction =
  592. $this->fuckhtml
  593. ->getElementById(
  594. "fprs",
  595. "p"
  596. );
  597. if($correction){
  598. $this->fuckhtml->load($correction);
  599. $a =
  600. $this->fuckhtml
  601. ->getElementsByTagName(
  602. "a"
  603. );
  604. $using =
  605. $this->fuckhtml
  606. ->getElementById(
  607. "fprsl",
  608. $a
  609. );
  610. if($using){
  611. $using =
  612. $this->fuckhtml
  613. ->getTextContent(
  614. $using
  615. );
  616. $spans =
  617. $this->fuckhtml
  618. ->getElementsByTagName(
  619. "span"
  620. );
  621. $type_span =
  622. $this->fuckhtml
  623. ->getTextContent(
  624. $spans[0]
  625. );
  626. $type = "not_many";
  627. if(
  628. stripos(
  629. $type_span,
  630. "Showing results for"
  631. ) !== false
  632. ){
  633. $type = "including";
  634. }
  635. $correction =
  636. $this->fuckhtml
  637. ->getTextContent(
  638. $a[count($a) - 1]
  639. );
  640. $out["spelling"] = [
  641. "type" => $type,
  642. "using" => $using,
  643. "correction" => $correction
  644. ];
  645. }
  646. // reset
  647. $this->fuckhtml->load($result_div);
  648. }else{
  649. // get the "Did you mean?" prompt
  650. $taw =
  651. $this->fuckhtml
  652. ->getElementById(
  653. "taw"
  654. );
  655. if($taw){
  656. $this->fuckhtml->load($taw);
  657. $as =
  658. $this->fuckhtml
  659. ->getElementsByTagName(
  660. "a"
  661. );
  662. if(count($as) !== 0){
  663. $text =
  664. $this->fuckhtml
  665. ->getTextContent(
  666. $as[0]
  667. );
  668. // @TODO implement did_you_mean
  669. $out["spelling"] = [
  670. "type" => "including",
  671. "using" => $search,
  672. "correction" => $text
  673. ];
  674. }
  675. }
  676. $this->fuckhtml->load($result_div);
  677. }
  678. //
  679. // get notices
  680. //
  681. $botstuff =
  682. $this->fuckhtml
  683. ->getElementById(
  684. "botstuff"
  685. );
  686. // important for later
  687. $last_page = false;
  688. if($botstuff){
  689. $this->fuckhtml->load($botstuff);
  690. $cards =
  691. $this->fuckhtml
  692. ->getElementsByClassName(
  693. $this->getstyle(
  694. [
  695. "line-height" => "normal"
  696. ]
  697. ),
  698. "div"
  699. );
  700. foreach($cards as $card){
  701. $this->fuckhtml->load($card);
  702. $h2 =
  703. $this->fuckhtml
  704. ->getElementsByTagName(
  705. "h2"
  706. );
  707. if(count($h2) !== 0){
  708. $title =
  709. $this->fuckhtml
  710. ->getTextContent(
  711. $h2[0]
  712. );
  713. $card["innerHTML"] =
  714. str_replace(
  715. $h2[0]["outerHTML"],
  716. "",
  717. $card["innerHTML"]
  718. );
  719. }else{
  720. $title = "Notice";
  721. }
  722. $div =
  723. $this->fuckhtml
  724. ->getElementsByTagName(
  725. "div"
  726. );
  727. // probe for related searches div, if found, ignore it cause its shit
  728. $probe =
  729. $this->fuckhtml
  730. ->getElementsByAttributeValue(
  731. "role",
  732. "list",
  733. $div
  734. );
  735. // also probe for children
  736. if(count($probe) === 0){
  737. $probe =
  738. $this->fuckhtml
  739. ->getElementsByClassName(
  740. $this->getstyle(
  741. [
  742. "flex-shrink" => "0",
  743. "-moz-box-flex" => "0",
  744. "flex-grow" => "0",
  745. "overflow" => "hidden"
  746. ]
  747. ),
  748. $div
  749. );
  750. }
  751. if(count($probe) === 0){
  752. $description = [];
  753. $as =
  754. $this->fuckhtml
  755. ->getElementsByTagName(
  756. "a"
  757. );
  758. if(count($as) !== 0){
  759. $first = true;
  760. foreach($as as $a){
  761. $text_link =
  762. $this->fuckhtml
  763. ->getTextContent(
  764. $a
  765. );
  766. if(stripos($text_link, "repeat the search") !== false){
  767. $last_page = true;
  768. break 2;
  769. }
  770. $parts =
  771. explode(
  772. $a["outerHTML"],
  773. $card["innerHTML"],
  774. 2
  775. );
  776. $card["innerHTML"] = $parts[1];
  777. $value =
  778. preg_replace(
  779. '/ +/',
  780. " ",
  781. $this->fuckhtml
  782. ->getTextContent(
  783. $parts[0],
  784. false,
  785. false
  786. )
  787. );
  788. if(strlen(trim($value)) !== 0){
  789. $description[] = [
  790. "type" => "text",
  791. "value" => $value
  792. ];
  793. if($first){
  794. $description[0]["value"] =
  795. ltrim($description[0]["value"]);
  796. }
  797. }
  798. $first = false;
  799. $description[] = [
  800. "type" => "link",
  801. "url" =>
  802. $this->fuckhtml
  803. ->getTextContent(
  804. $a["attributes"]
  805. ["href"]
  806. ),
  807. "value" => $text_link
  808. ];
  809. }
  810. $text =
  811. $this->fuckhtml
  812. ->getTextContent(
  813. $card["innerHTML"],
  814. false,
  815. false
  816. );
  817. if(strlen(trim($text)) !== 0){
  818. $description[] = [
  819. "type" => "text",
  820. "value" =>
  821. rtrim(
  822. $text
  823. )
  824. ];
  825. }
  826. }
  827. if(count($description) !== 0){
  828. $out["answer"][] = [
  829. "title" => $title,
  830. "description" => $description,
  831. "url" => null,
  832. "thumb" => null,
  833. "table" => [],
  834. "sublink" => []
  835. ];
  836. }
  837. }
  838. }
  839. // reset
  840. $this->fuckhtml->load($html);
  841. }
  842. //
  843. // get "Related Searches" and "People also search for"
  844. //
  845. $relateds =
  846. $this->fuckhtml
  847. ->getElementsByClassName(
  848. "wyccme",
  849. "div"
  850. );
  851. foreach($relateds as $related){
  852. $text =
  853. $this->fuckhtml
  854. ->getTextContent(
  855. $related
  856. );
  857. if($text == "More results"){ continue; }
  858. $out["related"][] = $text;
  859. }
  860. //
  861. // Get text results
  862. //
  863. $results =
  864. $this->fuckhtml
  865. ->getElementsByClassName(
  866. "g",
  867. "div"
  868. );
  869. $this->skip_next = false;
  870. foreach($results as $result){
  871. if($this->skip_next){
  872. $this->skip_next = false;
  873. continue;
  874. }
  875. $this->fuckhtml->load($result);
  876. $web = [
  877. "title" => null,
  878. "description" => null,
  879. "url" => null,
  880. "date" => null,
  881. "type" => "web",
  882. "thumb" => [
  883. "url" => null,
  884. "ratio" => null
  885. ],
  886. "sublink" => [],
  887. "table" => []
  888. ];
  889. // Detect presence of sublinks
  890. $g =
  891. $this->fuckhtml
  892. ->getElementsByClassName(
  893. "g",
  894. "div"
  895. );
  896. $sublinks = [];
  897. if(count($g) > 0){
  898. $table =
  899. $this->fuckhtml
  900. ->getElementsByTagName(
  901. "table"
  902. );
  903. if(count($table) !== 0){
  904. // found some sublinks!
  905. $this->fuckhtml->load($table[0]);
  906. $tds =
  907. $this->fuckhtml
  908. ->getElementsByTagName(
  909. "td"
  910. );
  911. foreach($tds as $td){
  912. $this->fuckhtml->load($td);
  913. $a =
  914. $this->fuckhtml
  915. ->getElementsByTagName(
  916. "a"
  917. );
  918. if(
  919. count($a) === 0 ||
  920. (
  921. isset($a[0]["attributes"]["class"]) &&
  922. $a[0]["attributes"]["class"] == "fl"
  923. )
  924. ){
  925. continue;
  926. }
  927. $td["innerHTML"] =
  928. str_replace(
  929. $a[0]["outerHTML"],
  930. "",
  931. $td["innerHTML"]
  932. );
  933. $web["sublink"][] = [
  934. "title" =>
  935. $this->titledots(
  936. $this->fuckhtml
  937. ->getTextContent(
  938. $a[0]
  939. )
  940. ),
  941. "description" =>
  942. html_entity_decode(
  943. $this->titledots(
  944. $this->fuckhtml
  945. ->getTextContent(
  946. $td
  947. )
  948. )
  949. ),
  950. "url" =>
  951. $this->unshiturl(
  952. $a[0]
  953. ["attributes"]
  954. ["href"]
  955. ),
  956. "date" => null
  957. ];
  958. }
  959. // reset
  960. $this->fuckhtml->load($result);
  961. }
  962. // skip on next iteration
  963. $this->skip_next = true;
  964. }
  965. // get title
  966. $h3 =
  967. $this->fuckhtml
  968. ->getElementsByTagName(
  969. "h3"
  970. );
  971. if(count($h3) === 0){
  972. continue;
  973. }
  974. $web["title"] =
  975. $this->titledots(
  976. $this->fuckhtml
  977. ->getTextContent(
  978. $h3[0]
  979. )
  980. );
  981. // get url
  982. $as =
  983. $this->fuckhtml
  984. ->getElementsByTagName(
  985. "a"
  986. );
  987. $web["url"] =
  988. $this->unshiturl(
  989. $as[0]
  990. ["attributes"]
  991. ["href"]
  992. );
  993. if(
  994. !preg_match(
  995. '/^http/',
  996. $web["url"]
  997. )
  998. ){
  999. // skip if invalid url is found
  1000. continue;
  1001. }
  1002. //
  1003. // probe for twitter carousel
  1004. //
  1005. $carousel =
  1006. $this->fuckhtml
  1007. ->getElementsByTagName(
  1008. "g-scrolling-carousel"
  1009. );
  1010. if(count($carousel) !== 0){
  1011. $this->fuckhtml->load($carousel[0]);
  1012. $items =
  1013. $this->fuckhtml
  1014. ->getElementsByTagName(
  1015. "g-inner-card"
  1016. );
  1017. $has_thumbnail = false;
  1018. foreach($items as $item){
  1019. $this->fuckhtml->load($item);
  1020. if($has_thumbnail === false){
  1021. // get thumbnail
  1022. $thumb =
  1023. $this->fuckhtml
  1024. ->getElementsByTagName(
  1025. "img"
  1026. );
  1027. if(
  1028. count($thumb) !== 0 &&
  1029. isset($thumb[0]["attributes"]["id"])
  1030. ){
  1031. $web["thumb"] = [
  1032. "url" =>
  1033. $this->getdimg(
  1034. $thumb[0]["attributes"]["id"]
  1035. ),
  1036. "ratio" => "16:9"
  1037. ];
  1038. $has_thumbnail = true;
  1039. }
  1040. // or else, try getting a thumbnail from next container
  1041. }
  1042. // cache div
  1043. $div =
  1044. $this->fuckhtml
  1045. ->getElementsByTagName(
  1046. "div"
  1047. );
  1048. // get link
  1049. $links =
  1050. $this->fuckhtml
  1051. ->getElementsByTagName(
  1052. "a"
  1053. );
  1054. // get description of carousel sublink
  1055. $description =
  1056. $this->fuckhtml
  1057. ->getElementsByAttributeValue(
  1058. "role",
  1059. "heading",
  1060. $div
  1061. );
  1062. if(count($description) !== 0){
  1063. $description =
  1064. $this->titledots(
  1065. $this->fuckhtml
  1066. ->getTextContent(
  1067. $description[0]
  1068. )
  1069. );
  1070. }else{
  1071. $description = null;
  1072. }
  1073. $bottom =
  1074. $this->fuckhtml
  1075. ->getElementsByAttributeValue(
  1076. "style",
  1077. "z-index:2",
  1078. $div
  1079. );
  1080. $title = null;
  1081. $date = null;
  1082. if(count($bottom) !== 0){
  1083. $this->fuckhtml->load($bottom[0]);
  1084. $spans =
  1085. $this->fuckhtml
  1086. ->getElementsByTagName(
  1087. "span"
  1088. );
  1089. $title =
  1090. $this->fuckhtml
  1091. ->getTextContent(
  1092. $spans[0]
  1093. );
  1094. $date =
  1095. strtotime(
  1096. $this->fuckhtml
  1097. ->getTextContent(
  1098. $spans[count($spans) - 1]
  1099. )
  1100. );
  1101. }
  1102. $web["sublink"][] = [
  1103. "title" => $title,
  1104. "description" => $description,
  1105. "url" =>
  1106. $this->unshiturl(
  1107. $links[0]
  1108. ["attributes"]
  1109. ["href"]
  1110. ),
  1111. "date" => $date
  1112. ];
  1113. }
  1114. $out["web"][] = $web;
  1115. continue;
  1116. }
  1117. //
  1118. // get viewcount, time posted and follower count from <cite> tag
  1119. //
  1120. $cite =
  1121. $this->fuckhtml
  1122. ->getElementsByTagName(
  1123. "cite"
  1124. );
  1125. if(count($cite) !== 0){
  1126. $this->fuckhtml->load($cite[0]);
  1127. $spans =
  1128. $this->fuckhtml
  1129. ->getElementsByTagName("span");
  1130. if(count($spans) === 0){
  1131. $cites =
  1132. explode(
  1133. "·",
  1134. $this->fuckhtml
  1135. ->getTextContent(
  1136. $cite[0]
  1137. )
  1138. );
  1139. foreach($cites as $cite){
  1140. $cite = trim($cite);
  1141. if(
  1142. preg_match(
  1143. '/(.+) (views|followers|likes)$/',
  1144. $cite,
  1145. $match
  1146. )
  1147. ){
  1148. $web["table"][ucfirst($match[2])] =
  1149. $match[1];
  1150. }elseif(
  1151. preg_match(
  1152. '/ago$/',
  1153. $cite
  1154. )
  1155. ){
  1156. $web["date"] =
  1157. strtotime($cite);
  1158. }
  1159. }
  1160. }
  1161. // reset
  1162. $this->fuckhtml->load($result);
  1163. }
  1164. //
  1165. // attempt to fetch description cleanly
  1166. //
  1167. $description =
  1168. $this->fuckhtml
  1169. ->getElementsByAttributeValue(
  1170. "style",
  1171. "-webkit-line-clamp:2"
  1172. );
  1173. if(count($description) !== 0){
  1174. $web["description"] =
  1175. $this->titledots(
  1176. $this->fuckhtml
  1177. ->getTextContent(
  1178. $description[0]
  1179. )
  1180. );
  1181. }else{
  1182. // use ANOTHER method where the description is a header of the result
  1183. $description =
  1184. $this->fuckhtml
  1185. ->getElementsByAttributeValue(
  1186. "data-attrid",
  1187. "wa:/description"
  1188. );
  1189. if(count($description) !== 0){
  1190. // get date off that shit
  1191. $date =
  1192. $this->fuckhtml
  1193. ->getElementsByClassName(
  1194. $this->getstyle(
  1195. [
  1196. "font-size" => "12px",
  1197. "line-height" => "1.34",
  1198. "display" => "inline-block",
  1199. "font-family" => "google sans,arial,sans-serif",
  1200. "padding-right" => "0",
  1201. "white-space" => "nowrap"
  1202. ]
  1203. ),
  1204. "span"
  1205. );
  1206. if(count($date) !== 0){
  1207. $description[0]["innerHTML"] =
  1208. str_replace(
  1209. $date[0]["outerHTML"],
  1210. "",
  1211. $description[0]["innerHTML"]
  1212. );
  1213. $web["date"] =
  1214. strtotime(
  1215. $this->fuckhtml
  1216. ->getTextContent(
  1217. $date[0]
  1218. )
  1219. );
  1220. }
  1221. $web["description"] =
  1222. $this->fuckhtml
  1223. ->getTextContent(
  1224. $description[0]
  1225. );
  1226. }else{
  1227. // Yes.. You guessed it, use ANOTHER method to get descriptions
  1228. // off youtube containers
  1229. $description =
  1230. $this->fuckhtml
  1231. ->getElementsByClassName(
  1232. $this->getstyle(
  1233. [
  1234. "-webkit-box-orient" => "vertical",
  1235. "display" => "-webkit-box",
  1236. "font-size" => "14px",
  1237. "-webkit-line-clamp" => "2",
  1238. "line-height" => "22px",
  1239. "overflow" => "hidden",
  1240. "word-break" => "break-word",
  1241. "color" => "#4d5156"
  1242. ]
  1243. ),
  1244. "div"
  1245. );
  1246. if(count($description) !== 0){
  1247. // check for video duration
  1248. $duration =
  1249. $this->fuckhtml
  1250. ->getElementsByClassName(
  1251. $this->getstyle(
  1252. [
  1253. "background-color" => "rgba(0,0,0,0.6)",
  1254. "color" => "#fff",
  1255. "fill" => "#fff"
  1256. ]
  1257. ),
  1258. "div"
  1259. );
  1260. if(count($duration) !== 0){
  1261. $web["table"]["Duration"] =
  1262. $this->fuckhtml
  1263. ->getTextContent(
  1264. $duration[0]
  1265. );
  1266. }
  1267. $web["description"] =
  1268. $this->titledots(
  1269. html_entity_decode(
  1270. $this->fuckhtml
  1271. ->getTextContent(
  1272. $description[0]
  1273. )
  1274. )
  1275. );
  1276. // get author + time posted
  1277. $info =
  1278. $this->fuckhtml
  1279. ->getElementsByClassName(
  1280. $this->getstyle(
  1281. [
  1282. "color" => "var(" . $this->getcolorvar("#70757a") . ")",
  1283. "font-size" => "14px",
  1284. "line-height" => "20px",
  1285. "margin-top" => "12px"
  1286. ]
  1287. ),
  1288. "div"
  1289. );
  1290. if(count($info) !== 0){
  1291. $info =
  1292. explode(
  1293. "·",
  1294. $this->fuckhtml
  1295. ->getTextContent(
  1296. $info[0]
  1297. )
  1298. );
  1299. switch(count($info)){
  1300. case 3:
  1301. $web["table"]["Author"] = trim($info[1]);
  1302. $web["date"] = strtotime(trim($info[2]));
  1303. break;
  1304. case 2:
  1305. $web["date"] = strtotime(trim($info[1]));
  1306. break;
  1307. }
  1308. }
  1309. }
  1310. }
  1311. }
  1312. //
  1313. // get categories of content within the search result
  1314. //
  1315. $cats =
  1316. $this->fuckhtml
  1317. ->getElementsByAttributeName(
  1318. "data-sncf",
  1319. "div"
  1320. );
  1321. foreach($cats as $cat){
  1322. $this->fuckhtml->load($cat);
  1323. // detect image category
  1324. $images =
  1325. $this->fuckhtml
  1326. ->getElementsByTagName(
  1327. "img"
  1328. );
  1329. if(count($images) !== 0){
  1330. foreach($images as $image){
  1331. if(isset($image["attributes"]["id"])){
  1332. // we found an image
  1333. if(isset($image["attributes"]["width"])){
  1334. $width = (int)$image["attributes"]["width"];
  1335. if($width == 110){
  1336. $ratio = "1:1";
  1337. }elseif($width > 110){
  1338. $ratio = "16:9";
  1339. }else{
  1340. $ratio = "9:16";
  1341. }
  1342. }else{
  1343. $ratio = "1:1";
  1344. }
  1345. $web["thumb"] = [
  1346. "url" => $this->getdimg($image["attributes"]["id"]),
  1347. "ratio" => $ratio
  1348. ];
  1349. continue 2;
  1350. }
  1351. }
  1352. }
  1353. // Detect rating
  1354. $spans_unfiltered =
  1355. $this->fuckhtml
  1356. ->getElementsByTagName(
  1357. "span"
  1358. );
  1359. $spans =
  1360. $this->fuckhtml
  1361. ->getElementsByAttributeName(
  1362. "aria-label",
  1363. $spans_unfiltered
  1364. );
  1365. foreach($spans as $span){
  1366. if(
  1367. preg_match(
  1368. '/^Rated/',
  1369. $span["attributes"]["aria-label"]
  1370. )
  1371. ){
  1372. // found rating
  1373. // scrape rating
  1374. preg_match(
  1375. '/([0-9.]+).*([0-9.]+)/',
  1376. $span["attributes"]["aria-label"],
  1377. $rating
  1378. );
  1379. if(isset($rating[1])){
  1380. $web["table"]["Rating"] =
  1381. $rating[1] . "/" . $rating[2];
  1382. }
  1383. $has_seen_reviews = 0;
  1384. foreach($spans_unfiltered as $span_unfiltered){
  1385. if(
  1386. preg_match(
  1387. '/([0-9,.]+) +([A-z]+)$/',
  1388. $this->fuckhtml
  1389. ->getTextContent(
  1390. $span_unfiltered
  1391. ),
  1392. $votes
  1393. )
  1394. ){
  1395. $has_seen_reviews++;
  1396. $web["table"][ucfirst($votes[2])] = $votes[1];
  1397. continue;
  1398. }
  1399. $text =
  1400. $this->fuckhtml
  1401. ->getTextContent(
  1402. $span_unfiltered
  1403. );
  1404. if(
  1405. $text == "&nbsp;&nbsp;&nbsp;" ||
  1406. $text == ""
  1407. ){
  1408. break;
  1409. }
  1410. switch($has_seen_reviews){
  1411. case 1:
  1412. // scrape price
  1413. $web["table"]["Price"] = $text;
  1414. $has_seen_reviews++;
  1415. break;
  1416. case 2:
  1417. // scrape platform
  1418. $web["table"]["Platform"] = $text;
  1419. $has_seen_reviews++;
  1420. break;
  1421. case 3:
  1422. // Scrape type
  1423. $web["table"]["Medium"] = $text;
  1424. break;
  1425. }
  1426. }
  1427. continue 2;
  1428. }
  1429. }
  1430. // check if its a table of small sublinks
  1431. $table =
  1432. $this->fuckhtml
  1433. ->getElementsByClassName(
  1434. $this->getstyle(
  1435. [
  1436. "display" => "table",
  1437. "white-space" => "nowrap",
  1438. "margin" => "5px 0",
  1439. "line-height" => "1.58",
  1440. "color" => "var(" . $this->getcolorvar("#70757a") . ")"
  1441. ]
  1442. ),
  1443. "div"
  1444. );
  1445. if(count($table) !== 0){
  1446. $this->fuckhtml->load($table[0]);
  1447. $rows =
  1448. $this->fuckhtml
  1449. ->getElementsByClassName(
  1450. $this->getstyle(
  1451. [
  1452. "display" => "flex",
  1453. "white-space" => "normal"
  1454. ]
  1455. ),
  1456. "div"
  1457. );
  1458. foreach($rows as $row){
  1459. $this->fuckhtml->load($row);
  1460. $sublink = [
  1461. "title" => null,
  1462. "description" => null,
  1463. "url" => null,
  1464. "date" => null
  1465. ];
  1466. $link =
  1467. $this->fuckhtml
  1468. ->getElementsByTagName(
  1469. "a"
  1470. )[0];
  1471. $sublink["title"] =
  1472. $this->titledots(
  1473. $this->fuckhtml
  1474. ->getTextContent(
  1475. $link
  1476. )
  1477. );
  1478. $sublink["url"] =
  1479. $this->unshiturl(
  1480. $link
  1481. ["attributes"]
  1482. ["href"]
  1483. );
  1484. $row["innerHTML"] =
  1485. str_replace(
  1486. $link["outerHTML"],
  1487. "",
  1488. $row["innerHTML"]
  1489. );
  1490. $this->fuckhtml->load($row);
  1491. $spans =
  1492. $this->fuckhtml
  1493. ->getElementsByTagName(
  1494. "span"
  1495. );
  1496. foreach($spans as $span){
  1497. $text =
  1498. $this->fuckhtml
  1499. ->getTextContent(
  1500. $span
  1501. );
  1502. if(
  1503. preg_match(
  1504. '/answers?$/',
  1505. $text
  1506. )
  1507. ){
  1508. $sublink["description"] =
  1509. $text;
  1510. continue;
  1511. }
  1512. $time = strtotime($text);
  1513. if($time !== false){
  1514. $sublink["date"] = $time;
  1515. }
  1516. }
  1517. $web["sublink"][] = $sublink;
  1518. }
  1519. // reset
  1520. $this->fuckhtml->load($cat);
  1521. continue;
  1522. }
  1523. // check if its an answer header
  1524. $answer_header =
  1525. $this->fuckhtml
  1526. ->getElementsByClassName(
  1527. $this->getstyle(
  1528. [
  1529. "overflow" => "hidden",
  1530. "text-overflow" => "ellipsis"
  1531. ]
  1532. ),
  1533. "span"
  1534. );
  1535. if(count($answer_header) !== 0){
  1536. $link =
  1537. $this->fuckhtml
  1538. ->getElementsByTagName(
  1539. "a"
  1540. );
  1541. $cat["innerHTML"] =
  1542. str_replace(
  1543. $link[0]["outerHTML"],
  1544. "",
  1545. $cat["innerHTML"]
  1546. );
  1547. $web["sublink"][] = [
  1548. "title" =>
  1549. $this->fuckhtml
  1550. ->getTextContent(
  1551. $link[0]
  1552. ),
  1553. "description" =>
  1554. $this->titledots(
  1555. trim(
  1556. str_replace(
  1557. "\xc2\xa0",
  1558. " ",
  1559. html_entity_decode(
  1560. $this->fuckhtml
  1561. ->getTextContent(
  1562. $cat
  1563. )
  1564. )
  1565. ),
  1566. " ·"
  1567. )
  1568. ),
  1569. "url" =>
  1570. $this->fuckhtml
  1571. ->getTextContent(
  1572. $link[0]
  1573. ["attributes"]
  1574. ["href"]
  1575. ),
  1576. "date" => null
  1577. ];
  1578. continue;
  1579. }
  1580. // check if its list of small sublinks
  1581. $urls =
  1582. $this->fuckhtml
  1583. ->getElementsByTagName(
  1584. "a"
  1585. );
  1586. if(count($urls) !== 0){
  1587. // found small links
  1588. foreach($urls as $url){
  1589. $target =
  1590. $this->fuckhtml
  1591. ->getTextContent(
  1592. $url
  1593. ["attributes"]
  1594. ["href"]
  1595. );
  1596. if(
  1597. !preg_match(
  1598. '/^http/',
  1599. $target
  1600. )
  1601. ){
  1602. continue;
  1603. }
  1604. $web["sublink"][] = [
  1605. "title" =>
  1606. $this->titledots(
  1607. $this->fuckhtml
  1608. ->getTextContent(
  1609. $url
  1610. )
  1611. ),
  1612. "description" => null,
  1613. "url" => $target,
  1614. "date" => null
  1615. ];
  1616. }
  1617. continue;
  1618. }
  1619. // we probed everything, assume this is the description
  1620. // if we didn't find one cleanly previously
  1621. if($web["description"] === null){
  1622. $web["description"] =
  1623. $this->titledots(
  1624. $this->fuckhtml
  1625. ->getTextContent(
  1626. $cat
  1627. )
  1628. );
  1629. }
  1630. }
  1631. // check if description contains date
  1632. $description = explode("—", $web["description"], 2);
  1633. if(
  1634. count($description) === 2 &&
  1635. strlen($description[0]) <= 20
  1636. ){
  1637. $date = strtotime($description[0]);
  1638. if($date !== false){
  1639. $web["date"] = $date;
  1640. $web["description"] = ltrim($description[1]);
  1641. }
  1642. }
  1643. // fetch youtube thumbnail
  1644. $thumbnail =
  1645. $this->fuckhtml
  1646. ->getElementsByClassName(
  1647. $this->getstyle(
  1648. [
  1649. "border-radius" => "8px",
  1650. "height" => "fit-content",
  1651. "justify-content" => "center",
  1652. "margin-right" => "20px",
  1653. "margin-top" => "4px",
  1654. "position" => "relative",
  1655. "width" => "fit-content"
  1656. ]
  1657. ),
  1658. "div"
  1659. );
  1660. if(count($thumbnail) !== 0){
  1661. // load thumbnail container
  1662. $this->fuckhtml->load($thumbnail[0]);
  1663. $image =
  1664. $this->fuckhtml
  1665. ->getElementsByTagName(
  1666. "img"
  1667. );
  1668. if(
  1669. count($image) !== 0 &&
  1670. isset($image[0]["attributes"]["id"])
  1671. ){
  1672. $web["thumb"] = [
  1673. "url" =>
  1674. $this->unshit_thumb(
  1675. $this->getdimg(
  1676. $image[0]["attributes"]["id"]
  1677. )
  1678. ),
  1679. "ratio" => "16:9"
  1680. ];
  1681. }
  1682. // reset
  1683. $this->fuckhtml->load($result);
  1684. }
  1685. $out["web"][] = $web;
  1686. }
  1687. // reset
  1688. $this->fuckhtml->load($result_div);
  1689. //
  1690. // Get instant answers
  1691. //
  1692. $answer_containers =
  1693. $this->fuckhtml
  1694. ->getElementsByClassName(
  1695. $this->getstyle(
  1696. [
  1697. "padding-left" => "0px",
  1698. "padding-right" => "0px"
  1699. ]
  1700. ),
  1701. "div"
  1702. );
  1703. $date_class =
  1704. $this->getstyle(
  1705. [
  1706. "font-size" => "12px",
  1707. "line-height" => "1.34",
  1708. "display" => "inline-block",
  1709. "font-family" => "google sans,arial,sans-serif",
  1710. "padding-right" => "0",
  1711. "white-space" => "nowrap"
  1712. ]
  1713. );
  1714. foreach($answer_containers as $container){
  1715. $this->fuckhtml->load($container);
  1716. $web = [
  1717. "title" => null,
  1718. "description" => null,
  1719. "url" => null,
  1720. "date" => null,
  1721. "type" => "web",
  1722. "thumb" => [
  1723. "url" => null,
  1724. "ratio" => null
  1725. ],
  1726. "sublink" => [],
  1727. "table" => []
  1728. ];
  1729. $answers =
  1730. $this->fuckhtml
  1731. ->getElementsByAttributeName(
  1732. "aria-controls",
  1733. "div"
  1734. );
  1735. $item_insert_pos = 1;
  1736. foreach($answers as $answer){
  1737. $out["related"][] =
  1738. $this->fuckhtml
  1739. ->getTextContent(
  1740. $answer
  1741. );
  1742. if(
  1743. isset(
  1744. $this->blobs[
  1745. $answer
  1746. ["attributes"]
  1747. ["aria-controls"]
  1748. ]
  1749. )
  1750. ){
  1751. $this->fuckhtml->load(
  1752. $this->blobs[
  1753. $answer
  1754. ["attributes"]
  1755. ["aria-controls"]
  1756. ]
  1757. );
  1758. $divs =
  1759. $this->fuckhtml
  1760. ->getElementsByAttributeName(
  1761. "id",
  1762. "div"
  1763. );
  1764. foreach($divs as $div){
  1765. if(
  1766. !isset(
  1767. $this->blobs[
  1768. $div
  1769. ["attributes"]
  1770. ["id"]
  1771. ]
  1772. )
  1773. ){
  1774. continue;
  1775. }
  1776. $this->fuckhtml->load(
  1777. $this->blobs[
  1778. $div
  1779. ["attributes"]
  1780. ["id"]
  1781. ]
  1782. );
  1783. // get url
  1784. $as =
  1785. $this->fuckhtml
  1786. ->getElementsByTagName(
  1787. "a"
  1788. );
  1789. if(count($as) !== 0){
  1790. $web["url"] =
  1791. $this->unshiturl(
  1792. $as[0]["attributes"]["href"]
  1793. );
  1794. // skip entries that redirect to a search
  1795. if(
  1796. !preg_match(
  1797. '/^http/',
  1798. $web["url"]
  1799. )
  1800. ){
  1801. continue 3;
  1802. }
  1803. }
  1804. // get title
  1805. $h3 =
  1806. $this->fuckhtml
  1807. ->getElementsByTagName(
  1808. "h3"
  1809. );
  1810. if(count($h3) !== 0){
  1811. $web["title"] =
  1812. $this->titledots(
  1813. $this->fuckhtml
  1814. ->getTextContent(
  1815. $h3[0]
  1816. )
  1817. );
  1818. }
  1819. $description =
  1820. $this->fuckhtml
  1821. ->getElementsByAttributeValue(
  1822. "data-attrid",
  1823. "wa:/description",
  1824. "div"
  1825. );
  1826. if(count($description) !== 0){
  1827. // check for date
  1828. $this->fuckhtml->load($description[0]);
  1829. $date =
  1830. $this->fuckhtml
  1831. ->getElementsByClassName(
  1832. $date_class,
  1833. "span"
  1834. );
  1835. if(count($date) !== 0){
  1836. $description[0]["innerHTML"] =
  1837. str_replace(
  1838. $date[0]["outerHTML"],
  1839. "",
  1840. $description[0]["innerHTML"]
  1841. );
  1842. $web["date"] =
  1843. strtotime(
  1844. $this->fuckhtml
  1845. ->getTextContent(
  1846. $date[0]
  1847. )
  1848. );
  1849. }
  1850. $web["description"] =
  1851. ltrim(
  1852. $this->fuckhtml
  1853. ->getTextContent(
  1854. $description[0]
  1855. ),
  1856. ": "
  1857. );
  1858. }
  1859. }
  1860. foreach($out["web"] as $item){
  1861. if($item["url"] == $web["url"]){
  1862. continue 2;
  1863. }
  1864. }
  1865. array_splice($out["web"], $item_insert_pos, 0, [$web]);
  1866. $item_insert_pos++;
  1867. }
  1868. }
  1869. }
  1870. // reset
  1871. $this->fuckhtml->load($result_div);
  1872. //
  1873. // Scrape word definition
  1874. //
  1875. $definition_container =
  1876. $this->fuckhtml
  1877. ->getElementsByClassName(
  1878. "lr_container",
  1879. "div"
  1880. );
  1881. if(count($definition_container) !== 0){
  1882. $this->fuckhtml->load($definition_container[0]);
  1883. // get header
  1884. $header =
  1885. $this->fuckhtml
  1886. ->getElementsByAttributeValue(
  1887. "data-attrid",
  1888. "EntryHeader",
  1889. "div"
  1890. );
  1891. if(count($header) !== 0){
  1892. $description = [];
  1893. $this->fuckhtml->load($header[0]);
  1894. $title_div =
  1895. $this->fuckhtml
  1896. ->getElementsByClassName(
  1897. $this->getstyle(
  1898. [
  1899. "font-family" => "google sans,arial,sans-serif",
  1900. "font-size" => "28px",
  1901. "line-height" => "36px"
  1902. ]
  1903. )
  1904. );
  1905. if(count($title_div) !== 0){
  1906. $title =
  1907. $this->fuckhtml
  1908. ->getTextContent(
  1909. $title_div[0]
  1910. );
  1911. }else{
  1912. $title = "Word definition";
  1913. }
  1914. $subtext_div =
  1915. $this->fuckhtml
  1916. ->getElementsByClassName(
  1917. $this->getstyle(
  1918. [
  1919. "font-family" => "arial,sans-serif",
  1920. "font-size" => "14px",
  1921. "line-height" => "22px"
  1922. ]
  1923. ),
  1924. "span"
  1925. );
  1926. if(count($subtext_div) !== 0){
  1927. $description[] = [
  1928. "type" => "quote",
  1929. "value" =>
  1930. $this->fuckhtml
  1931. ->getTextContent(
  1932. $subtext_div[0]
  1933. )
  1934. ];
  1935. }
  1936. // get audio
  1937. $audio =
  1938. $this->fuckhtml
  1939. ->getElementsByTagName(
  1940. "audio"
  1941. );
  1942. if(count($audio) !== 0){
  1943. $this->fuckhtml->load($audio[0]);
  1944. $source =
  1945. $this->fuckhtml
  1946. ->getElementsByTagName(
  1947. "source"
  1948. );
  1949. if(count($source) !== 0){
  1950. $description[] = [
  1951. "type" => "audio",
  1952. "url" =>
  1953. preg_replace(
  1954. '/^\/\//',
  1955. "https://",
  1956. $this->fuckhtml
  1957. ->getTextContent(
  1958. $source[0]
  1959. ["attributes"]
  1960. ["src"]
  1961. )
  1962. )
  1963. ];
  1964. }
  1965. }
  1966. // remove header to avoid confusion
  1967. $definition_container[0]["innerHTML"] =
  1968. str_replace(
  1969. $header[0]["outerHTML"],
  1970. "",
  1971. $definition_container[0]["innerHTML"]
  1972. );
  1973. // reset
  1974. $this->fuckhtml->load($definition_container[0]);
  1975. $vmods =
  1976. $this->fuckhtml
  1977. ->getElementsByClassName(
  1978. "vmod",
  1979. "div"
  1980. );
  1981. foreach($vmods as $category){
  1982. if(
  1983. !isset(
  1984. $category
  1985. ["attributes"]
  1986. ["data-topic"]
  1987. ) ||
  1988. $category
  1989. ["attributes"]
  1990. ["class"] != "vmod"
  1991. ){
  1992. continue;
  1993. }
  1994. $this->fuckhtml->load($category);
  1995. // get category type
  1996. $type =
  1997. $this->fuckhtml
  1998. ->getElementsByTagName(
  1999. "i"
  2000. );
  2001. if(count($type) !== 0){
  2002. $description[] = [
  2003. "type" => "title",
  2004. "value" =>
  2005. $this->fuckhtml
  2006. ->getTextContent(
  2007. $type[0]
  2008. )
  2009. ];
  2010. }
  2011. // get heading text
  2012. $headings =
  2013. $this->fuckhtml
  2014. ->getElementsByClassName(
  2015. "xpdxpnd",
  2016. "div"
  2017. );
  2018. foreach($headings as $heading){
  2019. $description[] = [
  2020. "type" => "quote",
  2021. "value" =>
  2022. $this->fuckhtml
  2023. ->getTextContent(
  2024. $heading
  2025. )
  2026. ];
  2027. }
  2028. $definitions =
  2029. $this->fuckhtml
  2030. ->getElementsByAttributeValue(
  2031. "data-attrid",
  2032. "SenseDefinition",
  2033. "div"
  2034. );
  2035. $i = 1;
  2036. $text = [];
  2037. foreach($definitions as $definition){
  2038. $text[] =
  2039. $i . ". " .
  2040. $this->fuckhtml
  2041. ->getTextContent(
  2042. $definition
  2043. );
  2044. $i++;
  2045. }
  2046. if(count($text) !== 0){
  2047. $description[] = [
  2048. "type" => "text",
  2049. "value" =>
  2050. implode("\n", $text)
  2051. ];
  2052. }
  2053. }
  2054. $out["answer"][] = [
  2055. "title" => $title,
  2056. "description" => $description,
  2057. "url" => null,
  2058. "thumb" => null,
  2059. "table" => [],
  2060. "sublink" => []
  2061. ];
  2062. }
  2063. // reset
  2064. $this->fuckhtml->load($result_div);
  2065. }
  2066. //
  2067. // scrape elements with a g-section-with-header
  2068. // includes: images, news carousels
  2069. //
  2070. $g_sections =
  2071. $this->fuckhtml
  2072. ->getElementsByTagName(
  2073. "g-section-with-header"
  2074. );
  2075. if(count($g_sections) !== 0){
  2076. foreach($g_sections as $g_section){
  2077. // parse elements with a g-section-with-header
  2078. $this->fuckhtml->load($g_section);
  2079. $div_title =
  2080. $this->fuckhtml
  2081. ->getElementsByClassName(
  2082. "a-no-hover-decoration",
  2083. "a"
  2084. );
  2085. if(count($div_title) !== 0){
  2086. // title detected, skip
  2087. continue;
  2088. }
  2089. // no title detected: detect news container
  2090. $news =
  2091. $this->fuckhtml
  2092. ->getElementsByClassName(
  2093. $this->getstyle(
  2094. [
  2095. "outline-offset" => "-1px",
  2096. "outline-width" => "1px",
  2097. "display" => "flex",
  2098. "flex-direction" => "column",
  2099. "flex-grow" => "1"
  2100. ]
  2101. )
  2102. );
  2103. foreach($news as $new){
  2104. $this->fuckhtml->load($new);
  2105. $image =
  2106. $this->fuckhtml
  2107. ->getElementsByAttributeName(
  2108. "id",
  2109. "img"
  2110. );
  2111. if(
  2112. count($image) !== 0 &&
  2113. !(
  2114. isset($image[0]["attributes"]["style"]) &&
  2115. strpos(
  2116. $image[0]["attributes"]["style"],
  2117. "height:18px"
  2118. ) !== false
  2119. )
  2120. ){
  2121. $thumb = [
  2122. "url" =>
  2123. $this->getdimg(
  2124. $image[0]
  2125. ["attributes"]
  2126. ["id"]
  2127. ),
  2128. "ratio" => "1:1"
  2129. ];
  2130. }
  2131. $title =
  2132. $this->titledots(
  2133. $this->fuckhtml
  2134. ->getTextContent(
  2135. $this->fuckhtml
  2136. ->getElementsByAttributeValue(
  2137. "role",
  2138. "heading",
  2139. "div"
  2140. )[0]
  2141. )
  2142. );
  2143. $date_div =
  2144. $this->fuckhtml
  2145. ->getElementsByAttributeName(
  2146. "style",
  2147. "div"
  2148. );
  2149. $date = null;
  2150. if(count($date_div) !== 0){
  2151. foreach($date_div as $div){
  2152. if(
  2153. strpos(
  2154. $div["attributes"]["style"],
  2155. "bottom:"
  2156. ) !== false
  2157. ){
  2158. $date =
  2159. strtotime(
  2160. $this->fuckhtml
  2161. ->getTextContent(
  2162. $div
  2163. )
  2164. );
  2165. break;
  2166. }
  2167. }
  2168. }else{
  2169. $date = null;
  2170. }
  2171. $out["news"][] = [
  2172. "title" => $title,
  2173. "description" => null,
  2174. "date" => $date,
  2175. "thumb" => $thumb,
  2176. "url" =>
  2177. $this->fuckhtml
  2178. ->getTextContent(
  2179. $new
  2180. ["attributes"]
  2181. ["href"]
  2182. )
  2183. ];
  2184. }
  2185. }
  2186. // reset
  2187. $this->fuckhtml->load($result_div);
  2188. }
  2189. //
  2190. // Parse images (carousel, left hand-side)
  2191. //
  2192. $image_carousels =
  2193. $this->fuckhtml
  2194. ->getElementsByAttributeValue(
  2195. "id",
  2196. "media_result_group",
  2197. "div"
  2198. );
  2199. if(count($image_carousels) !== 0){
  2200. foreach($image_carousels as $image_carousel){
  2201. $this->fuckhtml->load($image_carousel);
  2202. // get related searches in image carousel
  2203. $relateds =
  2204. $this->fuckhtml
  2205. ->getElementsByClassName(
  2206. $this->getstyle(
  2207. [
  2208. "display" => "inline-block",
  2209. "margin-right" => "6px",
  2210. "outline" => "none",
  2211. "padding" => "6px 0"
  2212. ],
  2213. "a"
  2214. )
  2215. );
  2216. foreach($relateds as $related){
  2217. if(!isset($related["innerHTML"])){
  2218. // found an image
  2219. continue;
  2220. }
  2221. $text =
  2222. $this->fuckhtml
  2223. ->getTextContent(
  2224. $related
  2225. );
  2226. if($text != ""){
  2227. $out["related"][] = $text;
  2228. }
  2229. }
  2230. $div =
  2231. $this->fuckhtml
  2232. ->getElementsByTagName(
  2233. "div"
  2234. );
  2235. // get loaded images
  2236. $images =
  2237. $this->fuckhtml
  2238. ->getElementsByClassName(
  2239. "ivg-i",
  2240. $div
  2241. );
  2242. foreach($images as $image){
  2243. $this->fuckhtml->load($image);
  2244. $img_tags =
  2245. $this->fuckhtml
  2246. ->getElementsByTagName(
  2247. "img"
  2248. );
  2249. if(
  2250. !isset($image["attributes"]["data-docid"]) ||
  2251. !isset($this->image_arr[$image["attributes"]["data-docid"]])
  2252. ){
  2253. continue;
  2254. }
  2255. // search for the right image tag
  2256. $image_tag = false;
  2257. foreach($img_tags as $img){
  2258. if(
  2259. isset(
  2260. $img
  2261. ["attributes"]
  2262. ["alt"]
  2263. ) &&
  2264. trim(
  2265. $img
  2266. ["attributes"]
  2267. ["alt"]
  2268. ) != ""
  2269. ){
  2270. $image_tag = $img;
  2271. break;
  2272. }
  2273. }
  2274. if($image_tag === false){
  2275. continue;
  2276. }
  2277. $out["image"][] = [
  2278. "title" =>
  2279. $this->titledots(
  2280. $this->fuckhtml
  2281. ->getTextContent(
  2282. $image_tag
  2283. ["attributes"]
  2284. ["alt"]
  2285. )
  2286. ),
  2287. "source" =>
  2288. $this->image_arr[
  2289. $image
  2290. ["attributes"]
  2291. ["data-docid"]
  2292. ],
  2293. "url" =>
  2294. $this->fuckhtml
  2295. ->getTextContent(
  2296. $image
  2297. ["attributes"]
  2298. ["data-lpage"]
  2299. )
  2300. ];
  2301. }
  2302. // get unloaded javascript images
  2303. $images_js_sel =
  2304. $this->fuckhtml
  2305. ->getElementsByAttributeName(
  2306. "id",
  2307. $div
  2308. );
  2309. $loaded = [];
  2310. foreach($images_js_sel as $sel){
  2311. if(
  2312. !isset($this->blobs[$sel["attributes"]["id"]]) ||
  2313. in_array((string)$sel["attributes"]["id"], $loaded, true)
  2314. ){
  2315. // not an unloaded javascript image
  2316. continue;
  2317. }
  2318. $loaded[] = $sel["attributes"]["id"];
  2319. // get yet another javascript component
  2320. $this->fuckhtml->load($this->blobs[$sel["attributes"]["id"]]);
  2321. // get js node: contains title & url
  2322. $js_node =
  2323. $this->fuckhtml
  2324. ->getElementsByTagName(
  2325. "div"
  2326. )[0];
  2327. if(!isset($this->blobs[$js_node["attributes"]["id"]])){
  2328. // did not find refer id
  2329. continue;
  2330. }
  2331. // load second javascript component
  2332. $this->fuckhtml->load($this->blobs[$js_node["attributes"]["id"]]);
  2333. // get title from image alt text.
  2334. // data-src from this image is cropped, ignore it..
  2335. $img =
  2336. $this->fuckhtml
  2337. ->getElementsByTagName(
  2338. "img"
  2339. )[0];
  2340. $out["image"][] = [
  2341. "title" =>
  2342. $this->fuckhtml
  2343. ->getTextContent(
  2344. $img["attributes"]["alt"]
  2345. ),
  2346. "source" =>
  2347. $this->image_arr[
  2348. $js_node["attributes"]["data-docid"]
  2349. ],
  2350. "url" =>
  2351. $this->fuckhtml
  2352. ->getTextContent(
  2353. $js_node["attributes"]["data-lpage"]
  2354. )
  2355. ];
  2356. }
  2357. }
  2358. // reset
  2359. $this->fuckhtml->load($result_div);
  2360. }
  2361. //
  2362. // Parse videos
  2363. //
  2364. $this->fuckhtml->load($result_div);
  2365. $videos =
  2366. $this->fuckhtml
  2367. ->getElementsByAttributeName(
  2368. "data-vid",
  2369. "div"
  2370. );
  2371. foreach($videos as $video){
  2372. $this->fuckhtml->load($video);
  2373. // get url
  2374. $url =
  2375. $this->fuckhtml
  2376. ->getTextContent(
  2377. $video
  2378. ["attributes"]
  2379. ["data-surl"]
  2380. );
  2381. foreach($out["web"] as $link){
  2382. if($link["url"] == $url){
  2383. // ignore if we already have the video in $out["web"]
  2384. continue 2;
  2385. }
  2386. }
  2387. // get heading element
  2388. $heading =
  2389. $this->fuckhtml
  2390. ->getElementsByAttributeValue(
  2391. "role",
  2392. "heading",
  2393. "div"
  2394. );
  2395. if(count($heading) === 0){
  2396. // no heading, fuck this.
  2397. continue;
  2398. }
  2399. // get thumbnail before loading heading object
  2400. $image =
  2401. $this->fuckhtml
  2402. ->getElementsByAttributeName(
  2403. "id",
  2404. "img"
  2405. );
  2406. if(count($image) !== 0){
  2407. $thumb = [
  2408. "url" => $this->getdimg($image[0]["attributes"]["id"]),
  2409. "ratio" => "16:9"
  2410. ];
  2411. }else{
  2412. $thumb = [
  2413. "url" => null,
  2414. "ratio" => null
  2415. ];
  2416. }
  2417. // get duration
  2418. $duration_div =
  2419. $this->fuckhtml
  2420. ->getElementsByClassName(
  2421. $this->getstyle(
  2422. [
  2423. "border-radius" => "10px",
  2424. "font-family" => "arial,sans-serif-medium,sans-serif",
  2425. "font-size" => "12px",
  2426. "line-height" => "16px",
  2427. "padding-block" => "2px",
  2428. "padding-inline" => "8px"
  2429. ]
  2430. ),
  2431. "div"
  2432. );
  2433. if(count($duration_div) !== 0){
  2434. $duration =
  2435. $this->hms2int(
  2436. $this->fuckhtml
  2437. ->getTextContent(
  2438. $duration_div[0]
  2439. )
  2440. );
  2441. }else{
  2442. // check if its a livestream
  2443. $duration =
  2444. $this->fuckhtml
  2445. ->getElementsByClassName(
  2446. $this->getstyle(
  2447. [
  2448. "background-color" => "#d93025",
  2449. "border-radius" => "10px",
  2450. "color" => "#fff",
  2451. "font-family" => "arial,sans-serif-medium,sans-serif",
  2452. "font-size" => "12px",
  2453. "line-height" => "16px",
  2454. "padding-block" => "2px",
  2455. "padding-inline" => "8px"
  2456. ]
  2457. ),
  2458. "span"
  2459. );
  2460. if(count($duration) !== 0){
  2461. $duration = "_LIVE";
  2462. }else{
  2463. $duration = null;
  2464. }
  2465. }
  2466. // load heading
  2467. $this->fuckhtml->load($heading[0]);
  2468. // get title
  2469. $title =
  2470. $this->fuckhtml
  2471. ->getElementsByClassName(
  2472. $this->getstyle(
  2473. [
  2474. "font-family" => "arial,sans-serif",
  2475. "font-size" => "16px",
  2476. "font-weight" => "400",
  2477. "line-height" => "24px"
  2478. ]
  2479. ),
  2480. "div"
  2481. );
  2482. if(count($title) === 0){
  2483. // ?? no title
  2484. continue;
  2485. }
  2486. $title =
  2487. $this->titledots(
  2488. $this->fuckhtml
  2489. ->getTextContent(
  2490. $title[0]
  2491. )
  2492. );
  2493. // get date
  2494. $date_div =
  2495. $this->fuckhtml
  2496. ->getElementsByClassName(
  2497. $this->getstyle(
  2498. [
  2499. "color" => "var(" . $this->getcolorvar("#70757a") . ")",
  2500. "font-size" => "14px"
  2501. ]
  2502. ),
  2503. "div"
  2504. );
  2505. if(count($date_div) !== 0){
  2506. $date = strtotime(
  2507. $this->fuckhtml
  2508. ->getTextContent(
  2509. $date_div[0]
  2510. )
  2511. );
  2512. if($date === false){
  2513. // failed to parse date
  2514. $date = null;
  2515. }
  2516. }else{
  2517. $date = null;
  2518. }
  2519. $out["video"][] = [
  2520. "title" => $title,
  2521. "description" => null,
  2522. "date" => $date,
  2523. "duration" => $duration,
  2524. "views" => null,
  2525. "thumb" => $thumb,
  2526. "url" => $url
  2527. ];
  2528. }
  2529. //
  2530. // Parse featured results (which contain images, fuck the rest desu)
  2531. //
  2532. $this->fuckhtml->load($html);
  2533. $top =
  2534. $this->fuckhtml
  2535. ->getElementsByAttributeValue(
  2536. "aria-label",
  2537. "Featured results",
  2538. "div"
  2539. );
  2540. if(count($top) !== 0){
  2541. $this->fuckhtml->load($top[0]);
  2542. // get images
  2543. $grid =
  2544. $this->fuckhtml
  2545. ->getElementsByClassName(
  2546. $this->getstyle(
  2547. [
  2548. "border-radius" => "20px",
  2549. "display" => "grid",
  2550. "grid-gap" => "2px",
  2551. "grid-template-rows" => "repeat(2,minmax(0,1fr))",
  2552. "overflow" => "hidden",
  2553. "bottom" => "0",
  2554. "left" => "0",
  2555. "right" => "0",
  2556. "top" => "0",
  2557. "position" => "absolute",
  2558. ]
  2559. ),
  2560. "div"
  2561. );
  2562. if(count($grid) !== 0){
  2563. // we found image grid
  2564. $this->fuckhtml->load($grid[0]);
  2565. $images_div =
  2566. $this->fuckhtml
  2567. ->getElementsByAttributeName(
  2568. "data-attrid",
  2569. "div"
  2570. );
  2571. foreach($images_div as $image_div){
  2572. $this->fuckhtml->load($image_div);
  2573. $image =
  2574. $this->fuckhtml
  2575. ->getElementsByTagName(
  2576. "img"
  2577. );
  2578. if(
  2579. count($image) === 0 ||
  2580. !isset($image_div["attributes"]["data-docid"]) ||
  2581. !isset($this->image_arr[$image_div["attributes"]["data-docid"]])
  2582. ){
  2583. // ?? no image, continue
  2584. continue;
  2585. }
  2586. $out["image"][] = [
  2587. "title" =>
  2588. $this->titledots(
  2589. $this->fuckhtml
  2590. ->getTextContent(
  2591. $image[0]["attributes"]["alt"]
  2592. )
  2593. ),
  2594. "source" =>
  2595. $this->image_arr[
  2596. $image_div["attributes"]["data-docid"]
  2597. ],
  2598. "url" =>
  2599. $this->fuckhtml
  2600. ->getTextContent(
  2601. $image_div["attributes"]["data-lpage"]
  2602. )
  2603. ];
  2604. }
  2605. }
  2606. }
  2607. //
  2608. // craft $npt token
  2609. //
  2610. if(
  2611. $last_page === false &&
  2612. count($out["web"]) !== 0
  2613. ){
  2614. if(!isset($params["start"])){
  2615. $params["start"] = 20;
  2616. }else{
  2617. $params["start"] += 20;
  2618. }
  2619. $out["npt"] =
  2620. $this->backend
  2621. ->store(
  2622. json_encode($params),
  2623. $pagetype,
  2624. $proxy
  2625. );
  2626. }
  2627. //
  2628. // Parse right handside
  2629. //
  2630. $this->fuckhtml->load($html);
  2631. $rhs =
  2632. $this->fuckhtml
  2633. ->getElementById(
  2634. "rhs"
  2635. );
  2636. if($rhs === null){
  2637. return $out;
  2638. }
  2639. $this->fuckhtml->load($rhs);
  2640. // get images gallery
  2641. $image_gallery =
  2642. $this->fuckhtml
  2643. ->getElementsByAttributeValue(
  2644. "data-rc",
  2645. "ivg-i",
  2646. "div"
  2647. );
  2648. if(count($image_gallery) !== 0){
  2649. $this->fuckhtml->load($image_gallery[0]);
  2650. // get images
  2651. $images_div =
  2652. $this->fuckhtml
  2653. ->getElementsByClassName(
  2654. "ivg-i",
  2655. "div"
  2656. );
  2657. foreach($images_div as $image_div){
  2658. $this->fuckhtml->load($image_div);
  2659. $image =
  2660. $this->fuckhtml
  2661. ->getElementsByTagName(
  2662. "img"
  2663. );
  2664. if(
  2665. count($image) === 0 ||
  2666. !isset(
  2667. $this->image_arr[
  2668. $image_div
  2669. ["attributes"]
  2670. ["data-docid"]
  2671. ]
  2672. )
  2673. ){
  2674. continue;
  2675. }
  2676. foreach($out["image"] as $existing_image){
  2677. // might already exist
  2678. if(
  2679. $existing_image["source"][1]["url"] ==
  2680. $this->image_arr[
  2681. $image_div
  2682. ["attributes"]
  2683. ["data-docid"]
  2684. ][1]["url"]
  2685. ){
  2686. continue 2;
  2687. }
  2688. }
  2689. $out["image"][] = [
  2690. "title" =>
  2691. $this->titledots(
  2692. $this->fuckhtml
  2693. ->getTextContent(
  2694. $image[0]
  2695. ["attributes"]
  2696. ["alt"]
  2697. )
  2698. ),
  2699. "source" =>
  2700. $this->image_arr[
  2701. $image_div
  2702. ["attributes"]
  2703. ["data-docid"]
  2704. ],
  2705. "url" =>
  2706. $this->fuckhtml
  2707. ->getTextContent(
  2708. $image_div
  2709. ["attributes"]
  2710. ["data-lpage"]
  2711. )
  2712. ];
  2713. }
  2714. // reset
  2715. $this->fuckhtml->load($rhs);
  2716. }
  2717. // get header container
  2718. $header =
  2719. $this->fuckhtml
  2720. ->getElementsByClassName(
  2721. $this->getstyle(
  2722. [
  2723. "padding" => "0 0 16px 20px",
  2724. "display" => "flex"
  2725. ]
  2726. ),
  2727. "div"
  2728. );
  2729. // stop parsing wikipedia heads if there isn't a header
  2730. $description = [];
  2731. $title = "About";
  2732. if(count($header) !== 0){
  2733. $this->fuckhtml->load($header[0]);
  2734. // g-snackbar-action present: we found a button instead
  2735. if(
  2736. count(
  2737. $this->fuckhtml
  2738. ->getElementsByTagName(
  2739. "g-snackbar-action"
  2740. )
  2741. ) !== 0
  2742. ){
  2743. $title_tag =
  2744. $this->fuckhtml
  2745. ->getElementsByAttributeValue(
  2746. "data-attrid",
  2747. "title",
  2748. "div"
  2749. );
  2750. if(count($title_tag) !== 0){
  2751. $title =
  2752. $this->fuckhtml
  2753. ->getTextContent(
  2754. $title_tag[0]
  2755. );
  2756. $header[0]["innerHTML"] =
  2757. str_replace(
  2758. $title_tag[0]["outerHTML"],
  2759. "",
  2760. $header[0]["innerHTML"]
  2761. );
  2762. // if header still contains text, add it as a subtitle in description
  2763. $subtitle =
  2764. $this->fuckhtml
  2765. ->getTextContent(
  2766. $header[0]
  2767. );
  2768. if(strlen($subtitle) !== 0){
  2769. $description[] = [
  2770. "type" => "quote",
  2771. "value" => $subtitle
  2772. ];
  2773. }
  2774. }
  2775. }
  2776. // reset
  2777. $this->fuckhtml->load($rhs);
  2778. }
  2779. // get description elements
  2780. $url = null;
  2781. $text =
  2782. $this->fuckhtml
  2783. ->getElementsByAttributeValue(
  2784. "data-attrid",
  2785. "description",
  2786. "div"
  2787. );
  2788. if(count($text) !== 0){
  2789. $this->fuckhtml->load($text[0]);
  2790. $a =
  2791. $this->fuckhtml
  2792. ->getElementsByTagName(
  2793. "a"
  2794. );
  2795. if(count($a) !== 0){
  2796. // get link and remove it from description
  2797. $a = $a[count($a) - 1];
  2798. $text[0]["innerHTML"] =
  2799. str_replace(
  2800. $a["outerHTML"],
  2801. "",
  2802. $text[0]["innerHTML"]
  2803. );
  2804. $url =
  2805. $this->fuckhtml
  2806. ->getTextContent(
  2807. $a
  2808. ["attributes"]
  2809. ["href"]
  2810. );
  2811. }
  2812. $description[] = [
  2813. "type" => "text",
  2814. "value" =>
  2815. html_entity_decode(
  2816. preg_replace(
  2817. '/^Description/',
  2818. "",
  2819. $this->fuckhtml
  2820. ->getTextContent(
  2821. $text[0]
  2822. )
  2823. )
  2824. )
  2825. ];
  2826. // reset
  2827. $this->fuckhtml->load($rhs);
  2828. }
  2829. // get reviews (google play, steam, etc)
  2830. $review_container =
  2831. $this->fuckhtml
  2832. ->getElementsByClassName(
  2833. $this->getstyle(
  2834. [
  2835. "align-items" => "start",
  2836. "display" => "flex"
  2837. ]
  2838. ),
  2839. "div"
  2840. );
  2841. if(count($review_container) !== 0){
  2842. $this->fuckhtml->load($review_container[0]);
  2843. $as =
  2844. $this->fuckhtml
  2845. ->getElementsByTagName(
  2846. "a"
  2847. );
  2848. if(count($as) !== 0){
  2849. $description[] = [
  2850. "type" => "title",
  2851. "value" => "Ratings"
  2852. ];
  2853. foreach($as as $a){
  2854. $this->fuckhtml->load($a);
  2855. $spans =
  2856. $this->fuckhtml
  2857. ->getElementsByTagName(
  2858. "span"
  2859. );
  2860. if(count($spans) >= 2){
  2861. $value =
  2862. trim(
  2863. $this->fuckhtml
  2864. ->getTextContent(
  2865. $spans[1]
  2866. ),
  2867. "· "
  2868. );
  2869. if(
  2870. $value == "" &&
  2871. isset($spans[2])
  2872. ){
  2873. $value =
  2874. $this->fuckhtml
  2875. ->getTextContent(
  2876. $spans[2]
  2877. );
  2878. }
  2879. $description[] = [
  2880. "type" => "link",
  2881. "url" =>
  2882. $this->fuckhtml
  2883. ->getTextContent(
  2884. $a["attributes"]
  2885. ["href"]
  2886. ),
  2887. "value" => $value
  2888. ];
  2889. $description[] = [
  2890. "type" => "text",
  2891. "value" =>
  2892. ": " .
  2893. $this->fuckhtml
  2894. ->getTextContent(
  2895. $spans[0]
  2896. ) . "\n"
  2897. ];
  2898. }
  2899. }
  2900. }
  2901. // reset
  2902. $this->fuckhtml->load($rhs);
  2903. }
  2904. // initialize sublinks
  2905. $sublinks = [];
  2906. // get description from business
  2907. if(count($description) === 0){
  2908. $data_attrid =
  2909. $this->fuckhtml
  2910. ->getElementsByAttributeName(
  2911. "data-attrid"
  2912. );
  2913. $summary =
  2914. $this->fuckhtml
  2915. ->getElementsByAttributeValue(
  2916. "data-attrid",
  2917. "kc:/local:one line summary",
  2918. $data_attrid
  2919. );
  2920. if(count($summary) !== 0){
  2921. $description[] = [
  2922. "type" => "quote",
  2923. "value" =>
  2924. $this->fuckhtml
  2925. ->getTextContent(
  2926. $summary[0]
  2927. )
  2928. ];
  2929. // remove summary so it doesnt get parsed as a table
  2930. $rhs["innerHTML"] =
  2931. str_replace(
  2932. $summary[0]["outerHTML"],
  2933. "",
  2934. $rhs["innerHTML"]
  2935. );
  2936. $this->fuckhtml->load($rhs);
  2937. }
  2938. $address =
  2939. $this->fuckhtml
  2940. ->getElementsByAttributeValue(
  2941. "data-attrid",
  2942. "kc:/location/location:address",
  2943. $data_attrid
  2944. );
  2945. if(count($address) !== 0){
  2946. $description[] = [
  2947. "type" => "text",
  2948. "value" =>
  2949. $this->fuckhtml
  2950. ->getTextContent(
  2951. $address[0]
  2952. )
  2953. ];
  2954. }
  2955. // get title
  2956. $title_div =
  2957. $this->fuckhtml
  2958. ->getElementsByAttributeValue(
  2959. "data-attrid",
  2960. "title",
  2961. $data_attrid
  2962. );
  2963. if(count($title_div) !== 0){
  2964. $title =
  2965. $this->fuckhtml
  2966. ->getTextContent(
  2967. $title_div[0]
  2968. );
  2969. }
  2970. // get phone number
  2971. $phone =
  2972. $this->fuckhtml
  2973. ->getElementsByAttributeValue(
  2974. "data-attrid",
  2975. "kc:/local:alt phone",
  2976. $data_attrid
  2977. );
  2978. if(count($phone) !== 0){
  2979. $this->fuckhtml->load($phone[0]);
  2980. $sublinks["Call"] =
  2981. "tel:" .
  2982. $this->fuckhtml
  2983. ->getTextContent(
  2984. $this->fuckhtml
  2985. ->getElementsByAttributeName(
  2986. "aria-label",
  2987. "span"
  2988. )[0]
  2989. );
  2990. $this->fuckhtml->load($rhs);
  2991. }
  2992. }
  2993. if(count($description) === 0){
  2994. // still no description? abort
  2995. return $out;
  2996. }
  2997. // get table elements
  2998. $table = [];
  2999. $table_elems =
  3000. $this->fuckhtml
  3001. ->getElementsByClassName(
  3002. $this->getstyle(
  3003. [
  3004. "margin-top" => "7px"
  3005. ]
  3006. ),
  3007. "div"
  3008. );
  3009. foreach($table_elems as $elem){
  3010. $this->fuckhtml->load($elem);
  3011. $spans =
  3012. $this->fuckhtml
  3013. ->getElementsByTagName(
  3014. "span"
  3015. );
  3016. if(count($spans) === 0){
  3017. // ?? invalid
  3018. continue;
  3019. }
  3020. $elem["innerHTML"] =
  3021. str_replace(
  3022. $spans[0]["outerHTML"],
  3023. "",
  3024. $elem["innerHTML"]
  3025. );
  3026. $key =
  3027. rtrim(
  3028. $this->fuckhtml
  3029. ->getTextContent(
  3030. $spans[0]
  3031. ),
  3032. ": "
  3033. );
  3034. if(
  3035. $key == "" ||
  3036. $key == "Phone"
  3037. ){
  3038. continue;
  3039. }
  3040. if($key == "Hours"){
  3041. $hours = [];
  3042. $this->fuckhtml->load($elem);
  3043. $trs =
  3044. $this->fuckhtml
  3045. ->getElementsByTagName(
  3046. "tr"
  3047. );
  3048. foreach($trs as $tr){
  3049. $this->fuckhtml->load($tr);
  3050. $tds =
  3051. $this->fuckhtml
  3052. ->getElementsByTagName(
  3053. "td"
  3054. );
  3055. if(count($tds) === 2){
  3056. $hours[] =
  3057. $this->fuckhtml
  3058. ->getTextContent(
  3059. $tds[0]
  3060. ) . ": " .
  3061. $this->fuckhtml
  3062. ->getTextContent(
  3063. $tds[1]
  3064. );
  3065. }
  3066. }
  3067. if(count($hours) !== 0){
  3068. $hours = implode("\n", $hours);
  3069. $table["Hours"] = $hours;
  3070. }
  3071. continue;
  3072. }
  3073. $table[$key] =
  3074. preg_replace(
  3075. '/ +/',
  3076. " ",
  3077. $this->fuckhtml
  3078. ->getTextContent(
  3079. $elem
  3080. )
  3081. );
  3082. }
  3083. // reset
  3084. $this->fuckhtml->load($rhs);
  3085. // get the website div
  3086. $as =
  3087. $this->fuckhtml
  3088. ->getElementsByAttributeValue(
  3089. "data-attrid",
  3090. "visit_official_site",
  3091. "a"
  3092. );
  3093. if(count($as) !== 0){
  3094. $sublinks["Website"] =
  3095. str_replace(
  3096. "http://",
  3097. "https://",
  3098. $this->fuckhtml
  3099. ->getTextContent(
  3100. $as[0]
  3101. ["attributes"]
  3102. ["href"]
  3103. )
  3104. );
  3105. }else{
  3106. // get website through button
  3107. $button =
  3108. $this->fuckhtml
  3109. ->getElementsByClassName(
  3110. "ab_button",
  3111. "a"
  3112. );
  3113. if(count($button) !== 0){
  3114. $sublinks["Website"] =
  3115. $this->unshiturl(
  3116. $this->fuckhtml
  3117. ->getTextContent(
  3118. $button[0]
  3119. ["attributes"]
  3120. ["href"]
  3121. )
  3122. );
  3123. }
  3124. }
  3125. // get social media links
  3126. $as =
  3127. $this->fuckhtml
  3128. ->getElementsByTagName(
  3129. "g-link"
  3130. );
  3131. foreach($as as $a){
  3132. $this->fuckhtml->load($a);
  3133. $link =
  3134. $this->fuckhtml
  3135. ->getElementsByTagName(
  3136. "a"
  3137. );
  3138. if(count($link) === 0){
  3139. continue;
  3140. }
  3141. $sublink_title =
  3142. $this->fuckhtml
  3143. ->getTextContent(
  3144. $a
  3145. );
  3146. if($sublink_title == "X (Twitter)"){
  3147. $sublink_title = "Twitter";
  3148. }
  3149. $sublinks[$sublink_title] =
  3150. $this->fuckhtml
  3151. ->getTextContent(
  3152. $link[0]
  3153. ["attributes"]
  3154. ["href"]
  3155. );
  3156. }
  3157. // reset
  3158. $this->fuckhtml->load($rhs);
  3159. // get those round containers
  3160. $containers =
  3161. $this->fuckhtml
  3162. ->getElementsByClassName(
  3163. "tpa-ci"
  3164. );
  3165. foreach($containers as $container){
  3166. $this->fuckhtml->load($container);
  3167. $as =
  3168. $this->fuckhtml
  3169. ->getElementsByTagName(
  3170. "a"
  3171. );
  3172. if(count($as) === 0){
  3173. continue;
  3174. }
  3175. $sublinks[
  3176. $this->fuckhtml
  3177. ->getTextContent(
  3178. $as[0]
  3179. )
  3180. ] =
  3181. $this->fuckhtml
  3182. ->getTextContent(
  3183. $as[0]
  3184. ["attributes"]
  3185. ["href"]
  3186. );
  3187. }
  3188. $out["answer"][] = [
  3189. "title" => $title,
  3190. "description" => $description,
  3191. "url" => $url,
  3192. "thumb" => null,
  3193. "table" => $table,
  3194. "sublink" => $sublinks
  3195. ];
  3196. return $out;
  3197. }
  /**
   * Collect the images Google injects through inline javascript and
   * index them by identifier in $this->dimg.
   *
   * Two kinds of snippets are searched for in $html:
   *  - `google.ldi={...}` JSON objects mapping an identifier to a URL;
   *    every URL is passed through unshit_thumb() before being stored,
   *  - `var s='data:image/...';var ii=['id', ...];` statements, where a
   *    single JS string literal is shared by every identifier listed.
   *
   * $this->dimg is reset on each call. Entries are read back through
   * getdimg().
   *
   * @param string $html Page markup to scan.
   * @return void
   */
  private function scrape_dimg($html){
      
      $this->dimg = [];
      
      // get images loaded through javascript
      preg_match_all(
          '/function\(\){google\.ldi=({.*?});/',
          $html,
          $ldi
      );
      
      if(isset($ldi[1])){
          
          foreach($ldi[1] as $json){
              
              $map = json_decode($json, true);
              
              if(!is_array($map)){
                  
                  // json_decode() returns null when the captured text is
                  // not valid JSON (e.g. the lazy match stopped early);
                  // iterating over null would raise a warning
                  continue;
              }
              
              foreach($map as $key => $value){
                  
                  if(!is_string($value)){
                      
                      // unshit_thumb() hands its argument to parse_url(),
                      // which only accepts strings
                      continue;
                  }
                  
                  $this->dimg[$key] =
                      $this->unshit_thumb(
                          $value
                      );
              }
          }
      }
      
      // get additional javascript base64 images
      preg_match_all(
          '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/',
          $html,
          $inline
      );
      
      if(isset($inline[1])){
          
          foreach($inline[1] as $index => $literal){
              
              // group 2 is the comma separated list of quoted identifiers
              $quoted_ids = explode(",", $inline[2][$index]);
              
              $data_uri =
                  $this->fuckhtml
                  ->parseJsString(
                      $literal
                  );
              
              foreach($quoted_ids as $quoted_id){
                  
                  $this->dimg[trim($quoted_id, "'")] = $data_uri;
              }
          }
      }
  }
  /**
   * Build the lookup table $this->image_arr from the image tuples
   * embedded in the page.
   *
   * Each `[0,"<key>",["<url>",<n>,<n>],["<url>",<n>,<n>]` sequence found
   * in $html produces one entry, stored under <key>, made of two sources:
   *  [0] the URL of the second bracket pair (presumably the full-size
   *      image -- confirm against the page format),
   *  [1] the URL of the first bracket pair, passed through
   *      unshit_thumb().
   * For both sources the first number is stored as "height" and the
   * second one as "width".
   *
   * $this->image_arr is reset on each call.
   *
   * @param string $html Page markup to scan.
   * @return void
   */
  private function scrape_imagearr($html){
      
      // get image links arrays
      preg_match_all(
          '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/',
          $html,
          $found
      );
      
      $this->image_arr = [];
      
      if(!isset($found[1])){
          
          return;
      }
      
      foreach($found[1] as $row => $key){
          
          // groups 5-7: second bracket pair
          $first_source = [
              "url" =>
                  $this->fuckhtml
                  ->parseJsString(
                      $found[5][$row]
                  ),
              "width" => (int)$found[7][$row],
              "height" => (int)$found[6][$row]
          ];
          
          // groups 2-4: first bracket pair
          $second_url =
              $this->fuckhtml
              ->parseJsString(
                  $found[2][$row]
              );
          
          $second_source = [
              "url" => $this->unshit_thumb($second_url),
              "width" => (int)$found[4][$row],
              "height" => (int)$found[3][$row]
          ];
          
          $this->image_arr[$key] = [
              $first_source,
              $second_source
          ];
      }
  }
  /**
   * Look up an entry of $this->dimg (filled by scrape_dimg()).
   *
   * @param mixed $dimg Key to look up; callers in this file pass the
   *                    "id" attribute of an <img> element.
   * @return mixed The stored value, or null when the key is absent or
   *               maps to null.
   */
  private function getdimg($dimg){
      
      if(!isset($this->dimg[$dimg])){
          
          return null;
      }
      
      return $this->dimg[$dimg];
  }
  /**
   * Normalise a thumbnail URL served from a "tbn" gstatic.com host.
   *
   * When the host matches /tbn.*\.gstatic\.com/ and the query string has
   * a "q" parameter, the URL is rebuilt as
   * https://<host>/images?q=<q>, dropping every other parameter.
   * Any other input (different host, no query string, no usable "q",
   * unparsable URL) is returned untouched.
   *
   * @param string $url URL to normalise.
   * @return string Normalised URL, or $url itself when nothing applies.
   */
  private function unshit_thumb($url){
      
      // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj
      // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA
      $parts = parse_url($url);
      
      if(
          !isset($parts["host"]) ||
          // parse_url() omits "query" when the URL has none; reading it
          // blindly raised an undefined key warning
          !isset($parts["query"]) ||
          !preg_match(
              '/tbn.*\.gstatic\.com/',
              $parts["host"]
          )
      ){
          
          return $url;
      }
      
      parse_str($parts["query"], $params);
      
      if(
          !isset($params["q"]) ||
          // "q[]=..." would make this an array, which cannot be concatenated
          !is_string($params["q"])
      ){
          
          return $url;
      }
      
      return "https://" . $parts["host"] . "/images?q=" . $params["q"];
  }
  /**
   * Index the page's <style> rules so elements can be found by their CSS.
   *
   * Concatenates the innerHTML of every <style> element of the currently
   * loaded document, removes @-blocks other than @font-face, and stores the
   * remaining rules in $this->styles as
   *   selector => [property => lowercased value, ..., "_c" => property count].
   * Comma-separated selectors each get their own entry. The declarations of
   * the ":root" rule are additionally inverted into $this->css_colors
   * (value => lowercased property name).
   */
  private function parsestyles(){
      
      $styles = [];
      
      $style_div =
          $this->fuckhtml
          ->getElementsByTagName(
              "style"
          );
      
      $raw_styles = "";
      
      foreach($style_div as $style){
          
          $raw_styles .= $style["innerHTML"];
      }
      
      // filter out media/keyframe queries
      $filtered =
          preg_replace(
              '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/',
              "",
              $raw_styles
          );
      
      // preg_replace() returns null on a PCRE error; keep the unfiltered
      // stylesheet in that case instead of handing null to preg_match_all()
      if($filtered !== null){
          
          $raw_styles = $filtered;
      }
      
      // get styles
      preg_match_all(
          '/(.+?){([\S\s]*?)}/',
          $raw_styles,
          $matches
      );
      
      for($i=0; $i<count($matches[1]); $i++){
          
          // get style values
          preg_match_all(
              '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/',
              $matches[2][$i],
              $values_regex
          );
          
          $values = [];
          for($k=0; $k<count($values_regex[1]); $k++){
              
              $values[trim($values_regex[1][$k])] =
                  strtolower(trim($values_regex[2][$k]));
          }
          
          $names = explode(",", $matches[1][$i]);
          
          // h1,h2,h3 will each get their own array index
          foreach($names as $name){
              
              // spaces are deliberately not trimmed here: getstyle() splits
              // the selector on them
              $name = trim($name, "}\t\n\r\0\x0B");
              
              foreach($values as $key => $value){
                  
                  $styles[$name][$key] = $value;
              }
          }
      }
      
      // "_c" caches the number of declarations of each rule for getstyle()
      foreach($styles as $key => $values){
          
          $styles[$key]["_c"] = count($values);
      }
      
      $this->styles = $styles;
      
      // get CSS colors
      $this->css_colors = [];
      
      if(isset($this->styles[":root"])){
          
          foreach($this->styles[":root"] as $key => $value){
              
              // the "_c" counter is bookkeeping, not a CSS declaration
              if($key === "_c"){
                  
                  continue;
              }
              
              $this->css_colors[$value] = strtolower($key);
          }
      }
  }
  /**
   * Find the class/id name of the first stored rule made of exactly $styles.
   *
   * A rule of $this->styles matches when it has the same number of
   * declarations as $styles (compared through the "_c" counter) and every
   * property/value pair is equal. For the matching selector, only the part
   * after the last space is kept; "." and "#" are replaced by spaces and
   * leading whitespace is trimmed.
   *
   * @param array $styles property => value pairs to look for.
   * @return string|false The derived name, or false when no rule matches.
   */
  private function getstyle($styles){
      
      // after this line $styles has one more entry than it has declarations,
      // which is why a full match needs "_c" + 1 intersecting pairs
      $styles["_c"] = count($styles);
      $needed = $styles["_c"] + 1;
      
      foreach($this->styles as $selector => $declarations){
          
          $shared = array_intersect_assoc($declarations, $styles);
          
          if(count($shared) !== $needed){
              
              continue;
          }
          
          $segments = explode(" ", $selector);
          $last_segment = end($segments);
          
          return ltrim(str_replace([".", "#"], " ", $last_segment));
      }
      
      return false;
  }
  /**
   * Look up an entry of $this->css_colors.
   *
   * @param string $color Key to look up.
   * @return mixed The stored value, or null when the key is not present.
   */
  private function getcolorvar($color){
      
      return $this->css_colors[$color] ?? null;
  }
  /**
   * Run a Google web search and hand the fetched page to parsepage().
   *
   * When $get["npt"] is truthy, the JSON-encoded request parameters and the
   * proxy stored for that token are loaded from the backend. Otherwise the
   * request is built from $get: "s" (query), "country", "nsfw", "lang",
   * "older"/"newer" (timestamps, or false for no bound) and "spellcheck".
   *
   * @param array $get Request options as described above.
   * @return mixed Whatever parsepage() returns for the "web" page type.
   * @throws Exception When the result page cannot be fetched.
   */
  public function web($get){
      
      if($get["npt"]){
          
          [$params, $proxy] = $this->backend->get($get["npt"], "web");
          $params = json_decode($params, true);
          
          $search = $params["q"];
          
      }else{
          
          $search = $get["s"];
          $country = $get["country"];
          $nsfw = $get["nsfw"];
          $lang = $get["lang"];
          $older = $get["older"];
          $newer = $get["newer"];
          $spellcheck = $get["spellcheck"];
          $proxy = $this->backend->get_ip();
          
          $params = [
              "q" => $search,
              "hl" => "en",
              "num" => 20 // get 20 results
          ];
          
          // country
          if($country != "any"){
              
              $params["gl"] = $country;
          }
          
          // nsfw
          $params["safe"] = $nsfw == "yes" ? "off" : "active";
          
          // language
          if($lang != "any"){
              
              $params["lr"] = "lang_" . $lang;
          }
          
          // generate tbs
          $tbs = [];
          
          // get date
          $older = $older === false ? null : date("m/d/Y", $older);
          $newer = $newer === false ? null : date("m/d/Y", $newer);
          
          if(
              $older !== null ||
              $newer !== null
          ){
              
              // a missing bound is sent as an empty value
              $tbs["cdr"] = "1";
              $tbs["cd_min"] = $newer;
              $tbs["cd_max"] = $older;
          }
          
          // spellcheck filter
          if($spellcheck == "no"){
              
              $params["nfpr"] = "1";
          }
          
          if(count($tbs) !== 0){
              
              $params["tbs"] = "";
              
              foreach($tbs as $key => $value){
                  
                  $params["tbs"] .= $key . ":" . $value . ",";
              }
              
              $params["tbs"] = rtrim($params["tbs"], ",");
          }
      }
      
      try{
          $html =
              $this->get(
                  $proxy,
                  "https://www.google.com/search",
                  $params
              );
      }catch(Exception $error){
          
          // keep the original failure reachable through getPrevious()
          throw new Exception("Failed to get HTML", 0, $error);
      }
      
      return $this->parsepage($html, "web", $search, $proxy, $params);
  }
  /**
   * Run a Google video search and reshape the parsed page into video results.
   *
   * When $get["npt"] is truthy, the JSON-encoded request parameters and the
   * proxy stored for that token are loaded from the backend. Otherwise the
   * request is built from $get: "s" (query), "country", "nsfw",
   * "older"/"newer" (timestamps, or false for no bound), "duration",
   * "quality" and "captions".
   *
   * @param array $get Request options as described above.
   * @return array ["status", "npt", "video" => list of results, and empty
   *               "author", "livestream", "playlist" and "reel" lists].
   * @throws Exception When the result page cannot be fetched.
   */
  public function video($get){
      
      if($get["npt"]){
          
          [$params, $proxy] = $this->backend->get($get["npt"], "video");
          $params = json_decode($params, true);
          
          $search = $params["q"];
          
      }else{
          
          $search = $get["s"];
          $country = $get["country"];
          $nsfw = $get["nsfw"];
          $older = $get["older"];
          $newer = $get["newer"];
          $duration = $get["duration"];
          $quality = $get["quality"];
          $captions = $get["captions"];
          $proxy = $this->backend->get_ip();
          
          $params = [
              "q" => $search,
              "tbm" => "vid",
              "hl" => "en",
              "num" => "20"
          ];
          
          // country
          if($country != "any"){
              
              $params["gl"] = $country;
          }
          
          // nsfw
          $params["safe"] = $nsfw == "yes" ? "off" : "active";
          
          // every tbs filter is kept as key => value so that the keys survive
          // serialization (a plain implode() would drop "cdr", "cd_min" and
          // "cd_max" and send only their values)
          $tbs = [];
          
          // get date
          $older = $older === false ? null : date("m/d/Y", $older);
          $newer = $newer === false ? null : date("m/d/Y", $newer);
          
          if(
              $older !== null ||
              $newer !== null
          ){
              
              $tbs["cdr"] = "1";
              $tbs["cd_min"] = $newer;
              $tbs["cd_max"] = $older;
          }
          
          // duration
          if($duration != "any"){
              
              $tbs["dur"] = $duration;
          }
          
          // quality
          if($quality != "any"){
              
              $tbs["hq"] = $quality;
          }
          
          // captions
          if($captions != "any"){
              
              $tbs["cc"] = $captions;
          }
          
          // append tbs as "key:value,key:value"
          if(count($tbs) !== 0){
              
              $pairs = [];
              
              foreach($tbs as $key => $value){
                  
                  $pairs[] = $key . ":" . $value;
              }
              
              $params["tbs"] = implode(",", $pairs);
          }
      }
      
      try{
          $html =
              $this->get(
                  $proxy,
                  "https://www.google.com/search",
                  $params
              );
      }catch(Exception $error){
          
          // keep the original failure reachable through getPrevious()
          throw new Exception("Failed to get HTML", 0, $error);
      }
      
      $response = $this->parsepage($html, "videos", $search, $proxy, $params);
      
      $out = [
          "status" => "ok",
          "npt" => $response["npt"],
          "video" => [],
          "author" => [],
          "livestream" => [],
          "playlist" => [],
          "reel" => []
      ];
      
      foreach($response["web"] as $result){
          
          $out["video"][] = [
              "title" => $result["title"],
              "description" => $result["description"],
              "author" => [
                  "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null,
                  "url" => null,
                  "avatar" => null
              ],
              "date" => $result["date"],
              "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null,
              "views" => null,
              "thumb" => $result["thumb"],
              "url" => $result["url"]
          ];
      }
      
      return $out;
  }
  /**
   * Run a Google News search and parse the result page.
   *
   * When $get["npt"] is truthy, the stored payload for that token is appended
   * to "https://www.google.com" and fetched as-is. Otherwise the request is
   * built from $get: "s" (query), "country", "nsfw", "older"/"newer"
   * (timestamps, or false for no bound) and "sort".
   *
   * @param array $get Request options as described above.
   * @return array ["status" => "ok", "npt" => token or null, "news" => list of
   *               ["title", "author", "description", "date",
   *                "thumb" => ["url", "ratio"], "url"]].
   * @throws Exception When the page cannot be fetched, when the result
   *                   container is missing, or when detect_sorry() throws.
   */
  public function news($get){
      
      if($get["npt"]){
          
          [$req, $proxy] = $this->backend->get($get["npt"], "news");
          
          $url = "https://www.google.com" . $req;
          $params = [];
          
      }else{
          
          $search = $get["s"];
          $country = $get["country"];
          $nsfw = $get["nsfw"];
          $older = $get["older"];
          $newer = $get["newer"];
          $sort = $get["sort"];
          $proxy = $this->backend->get_ip();
          
          $url = "https://www.google.com/search";
          $params = [
              "q" => $search,
              "tbm" => "nws",
              "hl" => "en",
              "num" => "20"
          ];
          
          // country
          if($country != "any"){
              
              $params["gl"] = $country;
          }
          
          // nsfw
          $params["safe"] = $nsfw == "yes" ? "off" : "active";
          
          $tbs = [];
          
          // get date
          $older = $older === false ? null : date("m/d/Y", $older);
          $newer = $newer === false ? null : date("m/d/Y", $newer);
          
          if(
              $older !== null ||
              $newer !== null
          ){
              
              $tbs["cdr"] = "1";
              $tbs["cd_min"] = $newer;
              $tbs["cd_max"] = $older;
          }
          
          // sort by date instead of relevance
          if($sort == "date"){
              
              $tbs["sbd"] = "1";
          }
          
          // append tbs
          if(count($tbs) !== 0){
              
              $params["tbs"] = "";
              
              foreach($tbs as $key => $value){
                  
                  $params["tbs"] .= $key . ":" . $value . ",";
              }
              
              $params["tbs"] = rtrim($params["tbs"], ",");
          }
      }
      
      // both branches fetch through the same guarded call so a failed request
      // always surfaces as "Failed to get HTML"
      try{
          $html =
              $this->get(
                  $proxy,
                  $url,
                  $params
              );
      }catch(Exception $error){
          
          throw new Exception("Failed to get HTML", 0, $error);
      }
      
      $out = [
          "status" => "ok",
          "npt" => null,
          "news" => []
      ];
      
      $this->fuckhtml->load($html);
      
      $this->detect_sorry();
      
      // get images
      $this->scrape_dimg($html);
      
      // parse styles
      $this->parsestyles();
      
      // class names derived from the stylesheet; they do not change while the
      // results are walked, so resolve them once
      $carousel_class =
          $this->getstyle(
              [
                  "padding" => "16px 16px 40px 16px"
              ]
          );
      
      $author_class =
          $this->getstyle(
              [
                  "overflow" => "hidden",
                  "text-align" => "left",
                  "text-overflow" => "ellipsis",
                  "white-space" => "nowrap",
                  "margin-bottom" => "8px"
              ]
          );
      
      $center_col =
          $this->fuckhtml
          ->getElementById(
              "center_col",
              "div"
          );
      
      // the other getElementById() call sites in this file treat false as
      // "not found"; accept both values here
      if(
          $center_col === null ||
          $center_col === false
      ){
          
          throw new Exception("Could not grep result div");
      }
      
      $this->fuckhtml->load($center_col);
      
      // get next page
      $npt =
          $this->fuckhtml
          ->getElementById(
              "pnnext",
              "a"
          );
      
      if(
          $npt !== false &&
          $npt !== null
      ){
          
          $out["npt"] =
              $this->backend->store(
                  $this->fuckhtml
                  ->getTextContent(
                      $npt["attributes"]
                          ["href"]
                  ),
                  "news",
                  $proxy
              );
      }
      
      $as =
          $this->fuckhtml
          ->getElementsByAttributeName(
              "jsname",
              "a"
          );
      
      foreach($as as $a){
          
          $this->fuckhtml->load($a);
          
          // get title
          $title =
              $this->fuckhtml
              ->getElementsByAttributeValue(
                  "role",
                  "heading",
                  "div"
              );
          
          if(count($title) === 0){
              
              // links without a heading are not results
              continue;
          }
          
          $title =
              $this->titledots(
                  $this->fuckhtml
                  ->getTextContent(
                      $title[0]
                  )
              );
          
          // get thumbnail
          $image =
              $this->fuckhtml
              ->getElementsByAttributeName(
                  "id",
                  "img"
              );
          
          // check for padded title node, if found, we're inside a carousel
          $probe =
              count(
                  $this->fuckhtml
                  ->getElementsByClassName(
                      $carousel_class,
                      "div"
                  )
              ) !== 0;
          
          if(
              count($image) !== 0 &&
              !isset($image[0]["attributes"]["width"])
          ){
              
              $thumb = [
                  "url" =>
                      $this->getdimg(
                          $image[0]["attributes"]["id"]
                      ),
                  "ratio" => $probe === true ? "16:9" : "1:1"
              ];
          }else{
              
              $thumb = [
                  "url" => null,
                  "ratio" => null
              ];
          }
          
          // description: first div whose inline style sets a margin-top
          // (only looked for outside of carousels)
          $description = null;
          
          if($probe === false){
              
              $desc_divs =
                  $this->fuckhtml
                  ->getElementsByAttributeName(
                      "style",
                      "div"
                  );
              
              foreach($desc_divs as $desc){
                  
                  if(
                      strpos(
                          $desc["attributes"]["style"],
                          "margin-top:"
                      ) !== false
                  ){
                      
                      $description =
                          $this->titledots(
                              $this->fuckhtml
                              ->getTextContent(
                                  $desc
                              )
                          );
                      break;
                  }
              }
          }
          
          // get author
          $author =
              $this->fuckhtml
              ->getElementsByClassName(
                  $author_class,
                  "div"
              );
          
          if(count($author) !== 0){
              
              $author =
                  $this->fuckhtml
                  ->getTextContent(
                      $author[0]
                  );
          }else{
              
              $author = null;
          }
          
          // get date: last <span> of the first div whose inline style sets
          // a "bottom:" offset
          $date = null;
          
          $date_div =
              $this->fuckhtml
              ->getElementsByAttributeName(
                  "style",
                  "div"
              );
          
          foreach($date_div as $d){
              
              if(
                  strpos(
                      $d["attributes"]["style"],
                      "bottom:"
                  ) === false
              ){
                  
                  continue;
              }
              
              $this->fuckhtml->load($d);
              
              $span =
                  $this->fuckhtml
                  ->getElementsByTagName(
                      "span"
                  );
              
              if(count($span) !== 0){
                  
                  $timestamp =
                      strtotime(
                          $this->fuckhtml
                          ->getTextContent(
                              $span[count($span) - 1]
                          )
                      );
                  
                  // strtotime() returns false for text it cannot parse;
                  // report that as "no date" like a missing date node
                  if($timestamp !== false){
                      
                      $date = $timestamp;
                  }
              }
              
              break;
          }
          
          $out["news"][] = [
              "title" => $title,
              "author" => $author,
              "description" => $description,
              "date" => $date,
              "thumb" => $thumb,
              "url" =>
                  $this->unshiturl(
                      $a["attributes"]
                          ["href"]
                  )
          ];
      }
      
      return $out;
  }
  /**
   * Run a Google image search and parse the result page.
   *
   * When $get["npt"] is truthy, the JSON-encoded request parameters and the
   * proxy stored for that token are loaded from the backend. Otherwise the
   * request is built from $get: "s" (query, must not be empty), "country",
   * "nsfw", "time", "size", "ratio", "color", "type", "format" and "rights".
   *
   * @param array $get Request options as described above.
   * @return array ["status" => "ok", "npt" => token or null, "image" => list
   *               of ["title", "source", "url"]], where "source" is the
   *               entry scrape_imagearr() collected for the result.
   * @throws Exception On an empty search term, when the page cannot be
   *                   fetched, or when detect_sorry() throws.
   */
  public function image($get){
      
      // generate parameters
      if($get["npt"]){
          
          [$params, $proxy] =
              $this->backend->get(
                  $get["npt"],
                  "images"
              );
          
          $params = json_decode($params, true);
      }else{
          
          $search = $get["s"];
          if(strlen($search) === 0){
              
              throw new Exception("Search term is empty!");
          }
          
          $proxy = $this->backend->get_ip();
          $country = $get["country"];
          $nsfw = $get["nsfw"];
          $time = $get["time"];
          $size = $get["size"];
          $ratio = $get["ratio"];
          $color = $get["color"];
          $type = $get["type"];
          $format = $get["format"];
          $rights = $get["rights"];
          
          $params = [
              "q" => $search,
              "udm" => "2" // get images
          ];
          
          // country (image search uses cr instead of gl)
          if($country != "any"){
              
              $params["cr"] = "country" . strtoupper($country);
          }
          
          // nsfw
          $params["safe"] = $nsfw == "yes" ? "off" : "active";
          
          // generate tbs
          $tbs = [];
          
          // time
          if($time != "any"){
              
              $tbs["qdr"] = $time;
          }
          
          // size
          if($size != "any"){
              
              $params["imgsz"] = $size;
          }
          
          // ratio
          if($ratio != "any"){
              
              $params["imgar"] = $ratio;
          }
          
          // color
          if($color != "any"){
              
              if(
                  $color == "color" ||
                  $color == "trans"
              ){
                  
                  $params["imgc"] = $color;
              }elseif($color == "bnw"){
                  
                  $params["imgc"] = "gray";
              }else{
                  
                  $tbs["ic"] = "specific";
                  $tbs["isc"] = $color;
              }
          }
          
          // type
          if($type != "any"){
              
              $tbs["itp"] = $type;
          }
          
          // format
          if($format != "any"){
              
              $params["as_filetype"] = $format;
          }
          
          // rights (tbs)
          if($rights != "any"){
              
              $tbs["sur"] = $rights;
          }
          
          // append tbs
          if(count($tbs) !== 0){
              
              $params["tbs"] = "";
              
              foreach($tbs as $key => $value){
                  
                  $params["tbs"] .= $key . ":" . $value . ",";
              }
              
              $params["tbs"] = rtrim($params["tbs"], ",");
          }
      }
      
      try{
          $html =
              $this->get(
                  $proxy,
                  "https://www.google.com/search",
                  $params
              );
      }catch(Exception $error){
          
          // keep the original failure reachable through getPrevious()
          throw new Exception("Failed to get search page", 0, $error);
      }
      
      $this->fuckhtml->load($html);
      
      $this->detect_sorry();
      
      // get javascript images
      $this->scrape_imagearr($html);
      
      $out = [
          "status" => "ok",
          "npt" => null,
          "image" => []
      ];
      
      $images =
          $this->fuckhtml
          ->getElementsByClassName(
              "ivg-i",
              "div"
          );
      
      foreach($images as $div){
          
          $this->fuckhtml->load($div);
          
          $imgs =
              $this->fuckhtml
              ->getElementsByTagName("img");
          
          $docid = $div["attributes"]["data-docid"] ?? null;
          
          // a result without an <img> or without scraped source data cannot
          // be turned into a usable entry; skip it instead of emitting nulls
          if(
              count($imgs) === 0 ||
              $docid === null ||
              !isset($this->image_arr[$docid])
          ){
              
              continue;
          }
          
          $image = $imgs[0];
          
          $out["image"][] = [
              "title" =>
                  $this->titledots(
                      $this->fuckhtml
                      ->getTextContent(
                          $image["attributes"]["alt"]
                      )
                  ),
              "source" => $this->image_arr[$docid],
              "url" =>
                  $this->fuckhtml
                  ->getTextContent(
                      $div["attributes"]["data-lpage"]
                  )
          ];
      }
      
      // as usual, no way to check if there is a next page reliably
      if(count($out["image"]) > 50){
          
          if(!isset($params["start"])){
              
              $params["start"] = 10;
          }else{
              
              $params["start"] += 10;
          }
          
          // stored under the same page tag that the lookup at the top of
          // this method uses
          $out["npt"] =
              $this->backend
              ->store(
                  json_encode($params),
                  "images",
                  $proxy
              );
      }
      
      return $out;
  }
  /**
   * Unwrap a Google tracking link and strip known tracking noise from it.
   *
   * The input is first decoded with getTextContent(). If it has no host, its
   * query string is read and the "imgurl" (preferred) or "q" parameter
   * becomes the URL. The result is then cleaned depending on its host:
   *  - wikipedia.org: the ".m." mobile label is removed from the host
   *  - imdb.com / youtube.*: a leading "m." is removed from the host
   *  - play.google.*: "referrer", "hl" and "gl" are removed from the query
   *  - twitter.com: "ref_src" is removed from the query
   *  - maps.google.* with "maps?" in the URL: reduced to its "daddr" parameter
   *
   * @param string $url         URL or tracking link, as found in the page.
   * @param bool   $return_size When true, return an array with "url", "ref",
   *                            "thumb_width", "thumb_height", "image_width"
   *                            and "image_height" read from the tracking
   *                            link's "imgrefurl", "tbnw", "tbnh", "w" and
   *                            "h" parameters (null when absent).
   * @return string|array The cleaned URL, or the array described above.
   */
  private function unshiturl($url, $return_size = false){
      
      // decode
      $url =
          $this->fuckhtml
          ->getTextContent($url);
      
      // parameters of the tracking link; kept in their own variable so the
      // per-site cleanup below cannot overwrite the size fields
      $tracking = [];
      
      $url_parts = parse_url($url);
      
      if(
          !isset($url_parts["host"]) &&
          isset($url_parts["query"])
      ){
          
          // no host, we have a tracking url
          parse_str($url_parts["query"], $tracking);
          
          if(
              isset($tracking["imgurl"]) &&
              is_string($tracking["imgurl"])
          ){
              
              $url = $tracking["imgurl"];
          }
          elseif(
              isset($tracking["q"]) &&
              is_string($tracking["q"])
          ){
              
              $url = $tracking["q"];
          }
      }
      
      // parse_url() gives null/false when there is no usable host; an empty
      // string simply matches none of the patterns below
      $domain = (string)parse_url($url, PHP_URL_HOST);
      
      // rebuilds the query string of $target without the listed parameters
      $strip_params = static function($target, $names){
          
          $oldquery = parse_url($target, PHP_URL_QUERY);
          
          if(!is_string($oldquery)){
              
              return $target;
          }
          
          parse_str($oldquery, $kept);
          
          foreach($names as $name){
              
              unset($kept[$name]);
          }
          
          return
              str_replace(
                  $oldquery,
                  http_build_query($kept),
                  $target
              );
      };
      
      // rewrite URLs to remove extra tracking parameters
      if(
          preg_match(
              '/wikipedia\.org$/',
              $domain
          )
      ){
          
          // rewrite wikipedia mobile URLs to desktop
          $url =
              $this->replacedomain(
                  $url,
                  preg_replace(
                      '/([a-z0-9]+)(\.m\.)/',
                      '$1.',
                      $domain
                  )
              );
      }
      
      elseif(
          preg_match(
              '/imdb\.com$|youtube\.[^.]+$/',
              $domain
          )
      ){
          
          // rewrite imdb and youtube mobile URLs too
          $url =
              $this->replacedomain(
                  $url,
                  preg_replace(
                      '/^m\./',
                      "",
                      $domain
                  )
              );
      }
      
      elseif(
          preg_match(
              '/play\.google\.[^.]+$/',
              $domain
          )
      ){
          
          // remove referrers from play.google.com
          $url = $strip_params($url, ["referrer", "hl", "gl"]);
      }
      
      elseif(
          preg_match(
              '/twitter\.com$/',
              $domain
          )
      ){
          
          // remove more referrers from twitter.com
          $url = $strip_params($url, ["ref_src"]);
      }
      
      elseif(
          preg_match(
              '/maps\.google\.[^.]+/',
              $domain
          )
      ){
          
          if(stripos($url, "maps?") !== false){
              
              //https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7
              $maps_query = parse_url($url, PHP_URL_QUERY);
              
              if(is_string($maps_query)){
                  
                  parse_str($maps_query, $maps_params);
                  
                  if(isset($maps_params["daddr"])){
                      
                      $url =
                          "https://maps.google.com/maps?daddr=" .
                          urlencode($maps_params["daddr"]);
                  }
              }
          }
      }
      
      if($return_size){
          
          return [
              "url" => $url,
              "ref" => isset($tracking["imgrefurl"]) ? $tracking["imgrefurl"] : null,
              "thumb_width" => isset($tracking["tbnw"]) ? (int)$tracking["tbnw"] : null,
              "thumb_height" => isset($tracking["tbnh"]) ? (int)$tracking["tbnh"] : null,
              "image_width" => isset($tracking["w"]) ? (int)$tracking["w"] : null,
              "image_height" => isset($tracking["h"]) ? (int)$tracking["h"] : null
          ];
      }
      
      return $url;
  }
  /**
   * Replace the host of an http(s) URL.
   *
   * Only the first "http(s)://host" occurrence is rewritten, so URLs embedded
   * further along (for example inside the query string) are left untouched.
   *
   * @param string $url    URL to rewrite.
   * @param string $domain Host to put in place of the current one.
   * @return string|null The rewritten URL ($url unchanged when it contains no
   *                     http(s) authority); null if preg_replace() fails.
   */
  private function replacedomain($url, $domain){
      
      return
          preg_replace(
              '/(https?:\/\/)([^\/]+)/',
              // braces keep a domain starting with a digit from being read
              // as part of the group number ("$1" . "1x.com" would be "$11")
              '${1}' . $domain,
              $url,
              1
          );
  }
  /**
   * Strip surrounding whitespace, dots and ellipsis characters from a title.
   *
   * Removes any run of space, ".", tab, newline, carriage return, NUL,
   * vertical tab and "…" (U+2026) from both ends. The trimming is done per
   * UTF-8 character: a byte-wise trim() with "…" in its character list would
   * also eat single bytes of other multibyte characters (for example of a
   * leading "“" or a trailing "æ") and leave invalid UTF-8 behind.
   *
   * @param string $title Text to clean.
   * @return string The trimmed text.
   */
  private function titledots($title){
      
      $title = (string)$title;
      
      $trimmed =
          preg_replace(
              '/\A[ .\t\n\r\x00\x0B\x{2026}]+|[ .\t\n\r\x00\x0B\x{2026}]+\z/u',
              "",
              $title
          );
      
      if($trimmed === null){
          
          // the subject is not valid UTF-8: only the single-byte characters
          // can be trimmed safely
          return trim($title, " .\t\n\r\0\x0B");
      }
      
      return $trimmed;
  }
  /**
   * Convert a "h:mm:ss", "m:ss" or "s" duration into a number of seconds.
   *
   * The string is split on ":" into at most three fields; each field is cast
   * to int.
   *
   * @param string $time Duration text.
   * @return int Total number of seconds.
   */
  private function hms2int($time){
      
      $seconds = 0;
      
      // each field is worth 60 times the one to its right
      foreach(explode(":", $time, 3) as $field){
          
          $seconds = ($seconds * 60) + (int)$field;
      }
      
      return $seconds;
  }
  /**
   * Abort when the currently loaded document contains a captcha container.
   *
   * Looks for a <div> with the id "recaptcha" in the document loaded into
   * $this->fuckhtml.
   *
   * @throws Exception When the lookup returns anything other than false.
   */
  private function detect_sorry(){
      
      $captcha_div =
          $this->fuckhtml->getElementById("recaptcha", "div");
      
      if($captcha_div === false){
          
          return;
      }
      
      throw new Exception("Google returned a captcha");
  }
  4132. }