mirrors
/
mikuinvidious
mirror of https://0xacab.org/johnxina/mikuinvidious.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
							from bs4 import BeautifulSoup
from conf import always_proxy

"""Format the result returned by cv link."""
def html_format_article(article_text):
	"""Clean up the markup of a cv article so it can be embedded in a page.

	The ``<div id="read-article-holder">`` element is located in
	*article_text*, given the id ``main-article`` and tidied in place:

	* headings ``h1``-``h5`` are demoted by one level (``h6`` stays ``h6``);
	* ``<br>`` tags are removed unless their parent is a ``<blockquote>``;
	* links to the hosts in ``_BILIBILI_HOSTS`` become site-relative;
	* ``<img>`` tags get ``src`` from ``data-src`` (through ``/proxy/pic/``
	  when ``always_proxy`` is set);
	* any element that directly contains a ``<span>``, and any heading that
	  directly contains a ``<strong>``, is flattened to its plain text;
	* attributes are stripped from every tag except ``<a>`` and ``<img>``.

	Args:
		article_text: HTML markup containing the article container.

	Returns:
		The cleaned article container serialised as a string.

	Raises:
		AttributeError: if no ``div`` with id ``read-article-holder`` exists.
	"""
	article_soup = BeautifulSoup(article_text, features='lxml')
	article_body = article_soup.find('div', id='read-article-holder')
	article_body.attrs = {}
	article_body['id'] = 'main-article'

	# Tree surgery is deferred until the walk is over so that the
	# ``descendants`` iterator is never invalidated underneath us.
	del_elems = []
	# Keyed by id(): bs4 tags compare equal by *content*, so a plain list
	# membership test would treat two identical paragraphs as one element.
	purge_elems = {}

	for child in article_body.descendants:
		name = getattr(child, 'name', None)
		if not name:
			# Text nodes, comments and the like carry no tag name.
			continue

		# Only real headings are demoted; a prefix test would also catch
		# tags such as <hr> and fail while parsing the heading level.
		if name in _HEADING_DEMOTION:
			name = _HEADING_DEMOTION[name]
			child.name = name

		parent = child.parent

		if name == 'br' and parent.name != 'blockquote':
			del_elems.append(child)
			continue
		if name == 'strong' and parent.name in _HEADING_DEMOTION:
			purge_elems[id(parent)] = parent

		if name == 'a':
			# Anchors keep their attributes; only the target is rewritten.
			href = child.get('href')
			if href:
				child['href'] = _localize_href(href)
			continue
		if name == 'img':
			_promote_image_source(child)
			continue
		if name == 'span':
			purge_elems[id(parent)] = parent

		child.attrs = {}

	for purge_elem in purge_elems.values():
		try:
			# Replace all children by the element's concatenated text.
			purge_elem.string = purge_elem.get_text()
		except Exception:
			# Best effort, as before: one stubborn element must not abort
			# the whole article (but Ctrl-C / SystemExit now propagate).
			continue

	for del_elem in del_elems:
		try:
			del_elem.extract()
		except Exception:
			# Best effort: the tag may already have been dropped by a purge.
			continue

	return str(article_body)


# Heading tags mapped to the tag they are demoted to. ``h6`` maps to itself
# because ``h7`` is not an HTML element.
_HEADING_DEMOTION = {
	'h1': 'h2',
	'h2': 'h3',
	'h3': 'h4',
	'h4': 'h5',
	'h5': 'h6',
	'h6': 'h6',
}

# Hosts whose links are rewritten to site-relative paths.
_BILIBILI_HOSTS = frozenset(('www.bilibili.com', 'bilibili.com'))


def _localize_href(href):
	"""Return *href* as a site-relative path if it targets a bilibili host.

	Absolute and protocol-relative URLs whose host is in ``_BILIBILI_HOSTS``
	are reduced to path, query and fragment. Every other value (relative
	links, other hosts, unparsable URLs) is returned unchanged.
	"""
	from urllib.parse import urlsplit

	try:
		parts = urlsplit(href)
		host = parts.hostname
	except ValueError:
		return href

	if host not in _BILIBILI_HOSTS:
		return href

	# A bare host has an empty path; '/' keeps the link pointing at the root
	# instead of producing an empty href.
	local = parts.path or '/'
	if parts.query:
		local += '?' + parts.query
	if parts.fragment:
		local += '#' + parts.fragment
	return local


def _promote_image_source(img):
	"""Move the ``data-src`` attribute of *img* into ``src``.

	``data-src`` and ``data-size`` are removed. When ``always_proxy`` is set
	the source is routed through ``/proxy/pic/``. An image without
	``data-src`` keeps whatever ``src`` it already has.
	"""
	source = img.attrs.pop('data-src', None)
	img.attrs.pop('data-size', None)
	if source is None:
		return

	if always_proxy:
		# NOTE(review): the 'http:' prefix assumes data-src is a
		# protocol-relative URL ('//host/...') - confirm against real pages.
		img['src'] = '/proxy/pic/http:' + source
	else:
		img['src'] = source