123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- from bs4 import BeautifulSoup
- from conf import always_proxy
- """Format the result returned by cv link."""
_BILIBILI_HOST = 'www.bilibili.com'


def _is_heading(tag_name):
    """Return True only for the tag names ``h1`` .. ``h6``.

    A plain ``startswith('h')`` test would also match ``hr``, ``header``,
    ``hgroup`` and so on, none of which carry a numeric level.
    """
    return len(tag_name) == 2 and tag_name[0] == 'h' and tag_name[1] in '123456'


def _strip_bilibili_host(href):
    """Turn an absolute ``www.bilibili.com`` URL into a site-relative path.

    Everything up to and including the first ``//`` plus the exact host
    ``www.bilibili.com`` is removed. Links without ``//`` and links that
    point at any other host are returned unchanged.
    """
    _, separator, rest = href.partition('//')
    if not separator or not rest.startswith(_BILIBILI_HOST):
        return href
    tail = rest[len(_BILIBILI_HOST):]
    if tail and tail[0] not in '/?#':
        # Some other host that merely begins with the same text
        # (e.g. "www.bilibili.com.example.org"); leave it alone.
        return href
    return tail or '/'


def html_format_article(article_text):
    """Clean up the HTML of an article page and return the article body.

    The ``<div id="read-article-holder">`` element is located in
    *article_text*, renamed to ``id="main-article"`` and its descendants
    are normalised:

    * ``h1``..``h5`` are demoted by one level (``h6`` stays ``h6``);
    * ``<br>`` outside of a ``<blockquote>`` is removed;
    * a heading that contains ``<strong>``, and the parent of every
      ``<span>``, is flattened to its plain text;
    * ``<a href>`` pointing at ``www.bilibili.com`` becomes a relative path;
    * ``<img>`` gets ``src`` from ``data-src`` (prefixed with
      ``/proxy/pic/http:`` when ``always_proxy`` is truthy) and loses
      ``data-src`` / ``data-size``;
    * every other tag has all of its attributes dropped.

    Args:
        article_text: HTML source of the article page.

    Returns:
        The serialised ``<div id="main-article">`` element as a string.

    Raises:
        AttributeError: if the page has no ``read-article-holder`` div
            (``find`` returns ``None`` in that case).
    """
    article_soup = BeautifulSoup(article_text, features='lxml')
    article_body = article_soup.find('div', id='read-article-holder')
    article_body.attrs = {}
    article_body['id'] = 'main-article'

    # Structural changes are queued and applied after the walk so the
    # tree is not reshaped while ``descendants`` is being iterated.
    del_elems = []
    purge_elems = []
    # Dedupe by identity: Tag equality is structural, so two distinct
    # elements with identical content would otherwise count as one.
    queued_ids = set()

    def queue_purge(elem):
        if id(elem) not in queued_ids:
            queued_ids.add(id(elem))
            purge_elems.append(elem)

    for child in article_body.descendants:
        if not child.name:
            # Text nodes have no tag name.
            continue

        if _is_heading(child.name):
            # Demote by one level; cap at h6 because h7 is not valid HTML.
            child.name = f'h{min(int(child.name[1]) + 1, 6)}'

        if child.name == 'br' and child.parent.name != 'blockquote':
            del_elems.append(child)
            continue
        if child.name == 'strong' and _is_heading(child.parent.name):
            queue_purge(child.parent)

        if not hasattr(child, 'attrs'):
            continue

        if child.name == 'a':
            href = child.get('href')
            if href is not None:
                child['href'] = _strip_bilibili_host(href)
            # Anchors keep their remaining attributes.
            continue

        if child.name == 'img':
            data_src = child.get('data-src')
            if data_src is not None:
                # NOTE(review): the proxy prefix ends in "http:", so
                # data-src is presumably protocol-relative ("//host/...");
                # confirm against real pages.
                if always_proxy:
                    child['src'] = '/proxy/pic/http:' + data_src
                else:
                    child['src'] = data_src
            child.attrs.pop('data-src', None)
            child.attrs.pop('data-size', None)
            continue

        if child.name == 'span':
            queue_purge(child.parent)

        child.attrs = {}

    for purge_elem in purge_elems:
        try:
            # Replace all children with the element's concatenated text.
            purge_elem.string = purge_elem.get_text()
        except Exception:
            # Best effort: one element that cannot be flattened must not
            # abort the whole article.
            continue

    for del_elem in del_elems:
        try:
            del_elem.extract()
        except Exception:
            # Best effort, same reasoning as above.
            continue

    return str(article_body)
|