htmlp.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. from bs4 import BeautifulSoup
  2. from conf import always_proxy
  3. """Format the result returned by cv link."""
  _BILIBILI_HOST = 'www.bilibili.com'


  def _heading_level(tag_name):
      """Return the level (1-6) of an ``h1``..``h6`` tag name, else ``None``.

      A plain ``startswith('h')`` test would also match ``hr``, ``header``,
      ``html`` and friends, which have no numeric level.
      """
      if len(tag_name) == 2 and tag_name[0] == 'h' and tag_name[1] in '123456':
          return int(tag_name[1])
      return None


  def _relative_href(href):
      """Drop the scheme and a leading ``www.bilibili.com`` host from *href*.

      Links without an authority part (relative paths, ``#fragment``,
      ``mailto:`` ...) are returned unchanged.  Links to any other host keep
      the host and lose only the scheme and ``//``.
      """
      if href.startswith('//'):
          rest = href[2:]
      else:
          scheme, sep, rest = href.partition('://')
          # Only treat '://' as a scheme separator when what precedes it
          # looks like a scheme; otherwise it is part of a path or query.
          if not sep or not scheme.isalpha():
              return href
      if rest.startswith(_BILIBILI_HOST):
          tail = rest[len(_BILIBILI_HOST):]
          # Require a path boundary so 'www.bilibili.com.example.org' is
          # not mistaken for the bilibili host.
          if not tail or tail.startswith('/'):
              return tail or '/'
      return rest


  def html_format_article(article_text):
      """Format the article HTML returned by a cv link for local display.

      Locates ``<div id="read-article-holder">`` in *article_text*, renames
      it to ``id="main-article"`` and cleans up the tags inside it:

      * ``h1``..``h5`` are demoted by one level; ``h6`` stays ``h6``.
      * ``<br>`` is removed unless its parent is a ``<blockquote>``.
      * A heading containing ``<strong>``, and any element that directly
        contains a ``<span>``, is flattened to its plain text.  This also
        drops every other nested tag in that element.
      * ``<a href>`` loses its scheme and a leading ``www.bilibili.com``.
      * ``<img>`` gets ``src`` from ``data-src`` (prefixed with
        ``/proxy/pic/http:`` when ``always_proxy`` is truthy); ``data-src``
        and ``data-size`` are removed.
      * Every other tag has all of its attributes cleared.

      Args:
          article_text: HTML markup of the article page.

      Returns:
          The cleaned container element serialized as a string.

      Raises:
          ValueError: if the ``read-article-holder`` div is not present.
      """
      article_soup = BeautifulSoup(article_text, features='lxml')
      article_body = article_soup.find('div', id='read-article-holder')
      if article_body is None:
          raise ValueError(
              "article container <div id='read-article-holder'> not found")
      article_body.attrs = {'id': 'main-article'}

      del_elems = []
      purge_elems = []
      # Tag equality in bs4 is structural, so ``tag in list`` would treat two
      # distinct but identical-looking parents as the same element.  Track
      # queued parents by identity instead.
      queued_ids = set()

      def queue_purge(tag):
          if id(tag) not in queued_ids:
              queued_ids.add(id(tag))
              purge_elems.append(tag)

      # find_all(True) yields only tags, in document order, as a snapshot
      # list; structural edits are deferred until after the loop.
      for child in article_body.find_all(True):
          level = _heading_level(child.name)
          if level is not None:
              # Demote by one level, but never emit the non-existent <h7>.
              child.name = f'h{min(level + 1, 6)}'

          parent = child.parent
          if child.name == 'br':
              if parent.name != 'blockquote':
                  del_elems.append(child)
                  continue
          elif (child.name == 'strong'
                  and _heading_level(parent.name) is not None):
              queue_purge(parent)

          if child.name == 'a':
              href = child.get('href')
              if href:
                  child['href'] = _relative_href(href)
              # Links keep their remaining attributes.
              continue
          if child.name == 'img':
              source = child.get('data-src')
              if source:
                  # NOTE(review): the proxy prefix ends in 'http:', so
                  # data-src is presumably protocol-relative ('//host/...')
                  # -- confirm against the upstream markup.
                  if always_proxy:
                      child['src'] = '/proxy/pic/http:' + source
                  else:
                      child['src'] = source
              child.attrs.pop('data-src', None)
              child.attrs.pop('data-size', None)
              # Images keep their remaining attributes.
              continue
          if child.name == 'span':
              queue_purge(parent)
          child.attrs = {}

      # Flatten first, then drop line breaks.  Both operations are safe on
      # nodes that an earlier flatten has already detached from the tree.
      for purge_elem in purge_elems:
          purge_elem.string = purge_elem.get_text()
      for del_elem in del_elems:
          del_elem.extract()
      return str(article_body)