Source code for relaton.serializers.bibxml.abstracts
from typing import List, cast
from lxml import etree
# Names exported by ``from <this module> import *``; only ``get_paragraphs``
# is listed, the other functions defined in this module are not.
__all__ = (
'get_paragraphs',
)
def get_paragraphs(val: str) -> List[str]:
    """Split ``val`` into a list of plain-text paragraphs.

    The value is first handed to :func:`get_paragraphs_html`. If that
    raises ``etree.XMLSyntaxError`` or ``ValueError``, the value is
    handed to :func:`get_paragraphs_plain` instead.

    :param val: text to split, either markup or plain text.
    :returns: whatever list the successful helper produced.
    """
    try:
        paragraphs = get_paragraphs_html(val)
    except (etree.XMLSyntaxError, ValueError):
        # HTML extraction failed: fall back to plain-text splitting.
        paragraphs = get_paragraphs_plain(val)
    return paragraphs
def get_paragraphs_html(val: str) -> List[str]:
    """Extract the text of top-level ``<p>`` elements from a markup fragment.

    ``val`` is wrapped in a synthetic ``<main>`` root element and parsed
    with ``etree.fromstring``, so the fragment has to be well-formed XML.
    Only ``<p>`` elements that are direct children of that root are
    considered.

    The text of a paragraph includes the text of any elements nested
    inside it, so ``<p>Some <em>important</em> note</p>`` yields
    ``"Some important note"`` rather than just the text that precedes
    the first child element.

    :param val: markup fragment to extract paragraphs from.
    :returns: the non-empty paragraph texts, in document order.
    :raises lxml.etree.XMLSyntaxError: if the wrapped fragment
        cannot be parsed.
    :raises ValueError: if no ``<p>`` element with text was found.
    """
    root = etree.fromstring(f'<main>{val}</main>')

    # XPath ``string()`` of an element is the concatenation of all of its
    # descendant text nodes, i.e. the paragraph with its tags stripped.
    # ``str()`` turns lxml's string result object into a plain ``str``.
    texts = (str(p.xpath('string()')) for p in root.findall('p'))
    paragraphs = [text for text in texts if text != '']

    if not paragraphs:
        raise ValueError("No HTML text detected")
    return paragraphs
def get_paragraphs_plain(val: str) -> List[str]:
    """Split plain text into paragraphs separated by blank lines.

    Line endings are normalized to ``\\n`` first, so text using
    ``\\r\\n`` or bare ``\\r`` line breaks is split the same way as
    text using ``\\n``. Each paragraph is stripped of leading and
    trailing whitespace, and paragraphs that are empty after stripping
    are dropped.

    :param val: plain text to split.
    :returns: the non-empty, stripped paragraphs, in order.
    """
    # Replace "\r\n" before lone "\r" so a CRLF pair becomes a single "\n".
    normalized = val.replace('\r\n', '\n').replace('\r', '\n')
    stripped = (chunk.strip() for chunk in normalized.split('\n\n'))
    return [paragraph for paragraph in stripped if paragraph != '']