Browse Source

feat: remove unneeded classes from parsed data

Zubarev Grigoriy 7 months ago
parent
commit
6a09982bfe
1 changed file with 24 additions and 9 deletions
  1. 24 9
      src/rural_dict/__main__.py

+ 24 - 9
src/rural_dict/__main__.py

@@ -5,14 +5,25 @@ import sys
 import requests
 from flask import Flask, redirect, render_template, request
 from requests import JSONDecodeError
-from selectolax.parser import HTMLParser
+from selectolax.parser import HTMLParser, Node
 
 app = Flask(__name__, template_folder="templates", static_folder="static")
 
 
+def remove_classes(node: Node) -> Node:
+    """Strip the ``class`` attribute from a node and every node below it.

+    The node is modified in place: its own ``class`` attribute is deleted
+    when present, then the function recurses into each node yielded by
+    ``node.iter()``.

+    Args:
+        node: Root of the parsed HTML subtree to clean.

+    Returns:
+        The same ``node`` object that was passed in, so the call can be
+        chained (e.g. ``remove_classes(node).html``).
+    """
+    if "class" in node.attributes:
+        del node.attrs["class"]

+    for child in node.iter():
+        remove_classes(child)
+    return node
+
+
 @app.route("/", defaults={"path": ""})
 @app.route("/<path:path>")
-def catch_all(path):
+def root_route(path):
+    """Check all routes on Urban Dictionary and redirect if needed."""
     path_without_host = re.sub(r"https?://[^/]+/", "", request.url)
     url = f"https://www.urbandictionary.com/{path_without_host}"
 
@@ -21,7 +32,7 @@ def catch_all(path):
     if data.history:
         return redirect(re.sub(r"https?://[^/]+", "", data.url), 302)
 
-    res = []
+    results = []
     parser = HTMLParser(data.text)
     definitions = parser.css("div[data-defid]")
     try:
@@ -31,23 +42,27 @@ def catch_all(path):
         )
         thumbs_json = requests.get(thumbs_api_url, timeout=10).json()["thumbs"]
         thumbs_data = {el["defid"]: el for el in thumbs_json}
-    except (KeyError, TypeError, JSONDecodeError):
+    except (KeyError, JSONDecodeError):
         thumbs_data = {}
 
     for definition in definitions:
         word = definition.css_first("a.word").text()
-        meaning = definition.css_first("div.meaning").html
-        example = definition.css_first("div.example").html
-        contributor = definition.css_first("div.contributor").html
+        meaning = remove_classes(definition.css_first("div.meaning")).html
+        example = remove_classes(definition.css_first("div.example")).html
+        contributor = remove_classes(definition.css_first("div.contributor")).html
         definition_id = int(definition.attributes["data-defid"])
         definition_thumbs = thumbs_data.get(definition_id, {})
         thumbs_up = definition_thumbs.get("up")
         thumbs_down = definition_thumbs.get("down")
-        res.append([definition_id, word, meaning, example, contributor, thumbs_up, thumbs_down])
+        results.append(
+            [definition_id, word, meaning, example, contributor, thumbs_up, thumbs_down]
+        )
     if (pagination := parser.css_first("div.pagination")) is not None:
+        pagination = remove_classes(pagination)
+        pagination.attrs["class"] = "pagination"
         pagination = pagination.html
 
-    return render_template("index.html", data=(res, pagination), term=request.args.get("term"))
+    return render_template("index.html", data=(results, pagination), term=request.args.get("term"))
 
 
 if __name__ == "__main__":