Browse Source

feat: rewrite data scraping in selectolax

This library is written in C and is more performant than bs4.
Zubarev Grigoriy 7 months ago
parent
commit
38e8868a9c
4 changed files with 28 additions and 32 deletions
  1. 1 1
      pyproject.toml
  2. 2 4
      requirements-dev.lock
  3. 2 4
      requirements.lock
  4. 23 23
      src/rural_dict/__main__.py

+ 1 - 1
pyproject.toml

@@ -11,10 +11,10 @@ authors = [
     { name = "zortazert", email = "zortazert@matthewevan.xyz" },
 ]
 dependencies = [
-    "beautifulsoup4~=4.12.3",
     "requests~=2.32.3",
     "flask~=3.0.3",
     "waitress~=3.0.0",
+    "selectolax>=0.3.21",
 ]
 dynamic = ["version"]
 

+ 2 - 4
requirements-dev.lock

@@ -10,8 +10,6 @@
 #   universal: true
 
 -e file:.
-beautifulsoup4==4.12.3
-    # via rural-dict
 blinker==1.8.2
     # via flask
 certifi==2024.7.4
@@ -35,8 +33,8 @@ markupsafe==2.1.5
     # via werkzeug
 requests==2.32.3
     # via rural-dict
-soupsieve==2.6
-    # via beautifulsoup4
+selectolax==0.3.21
+    # via rural-dict
 urllib3==2.2.2
     # via requests
 waitress==3.0.0

+ 2 - 4
requirements.lock

@@ -10,8 +10,6 @@
 #   universal: true
 
 -e file:.
-beautifulsoup4==4.12.3
-    # via rural-dict
 blinker==1.8.2
     # via flask
 certifi==2024.7.4
@@ -35,8 +33,8 @@ markupsafe==2.1.5
     # via werkzeug
 requests==2.32.3
     # via rural-dict
-soupsieve==2.6
-    # via beautifulsoup4
+selectolax==0.3.21
+    # via rural-dict
 urllib3==2.2.2
     # via requests
 waitress==3.0.0

+ 23 - 23
src/rural_dict/__main__.py

@@ -3,8 +3,9 @@ import re
 import sys
 
 import requests
-from bs4 import BeautifulSoup
 from flask import Flask, redirect, render_template, request
+from requests import JSONDecodeError
+from selectolax.parser import HTMLParser
 
 app = Flask(__name__, template_folder="templates", static_folder="static")
 
@@ -21,31 +22,30 @@ def catch_all(path):
         return redirect(re.sub(r"https?://[^/]+", "", data.url), 302)
 
     res = []
-    soup = BeautifulSoup(data.text, "html.parser")
-    defs = [(div, div.get("data-defid")) for div in soup.find_all("div") if div.get("data-defid")]
+    parser = HTMLParser(data.text)
+    definitions = parser.css("div[data-defid]")
     try:
-        thumbs_data = {
-            str(entry["defid"]): entry
-            for entry in requests.get(
-                "https://api.urbandictionary.com/v0/uncacheable?ids="
-                + ",".join(defid for (_, defid) in defs),
-                timeout=10,
-            ).json()["thumbs"]
-        }
-    except (KeyError, TypeError):
+        thumbs_api_url = (
+            f'https://api.urbandictionary.com/v0/uncacheable?ids='
+            f'{",".join(d.attributes["data-defid"] for d in definitions)}'
+        )
+        thumbs_json = requests.get(thumbs_api_url, timeout=10).json()["thumbs"]
+        thumbs_data = {el["defid"]: el for el in thumbs_json}
+    except (KeyError, TypeError, JSONDecodeError):
         thumbs_data = {}
 
-    for definition, defid in defs:
-        word = definition.select("div div h1 a, div div h2 a")[0].text
-        meaning = definition.find(attrs={"class": ["break-words meaning mb-4"]}).decode_contents()
-        example = definition.find(
-            attrs={"class": ["break-words example italic mb-4"]}
-        ).decode_contents()
-        contributor = definition.find(attrs={"class": ["contributor font-bold"]})
-        thumbs_up = thumbs_data.get(defid, {}).get("up")
-        thumbs_down = thumbs_data.get(defid, {}).get("down")
-        res.append([defid, word, meaning, example, contributor, thumbs_up, thumbs_down])
-    pagination = soup.find(attrs={"class": ["pagination text-xl text-center"]}) or ""
+    for definition in definitions:
+        word = definition.css_first("a.word").text()
+        meaning = definition.css_first("div.meaning").html
+        example = definition.css_first("div.example").html
+        contributor = definition.css_first("div.contributor").html
+        definition_id = int(definition.attributes["data-defid"])
+        definition_thumbs = thumbs_data.get(definition_id, {})
+        thumbs_up = definition_thumbs.get("up")
+        thumbs_down = definition_thumbs.get("down")
+        res.append([definition_id, word, meaning, example, contributor, thumbs_up, thumbs_down])
+    if (pagination := parser.css_first("div.pagination")) is not None:
+        pagination = pagination.html
 
     return render_template("index.html", data=(res, pagination), term=request.args.get("term"))