Source code for relaton.serializers.bibxml.abstracts

from typing import List, cast
from lxml import etree


# Public API of this module: only ``get_paragraphs`` is exported by
# ``from ... import *``; the HTML/plain-text helpers are left out.
__all__ = ('get_paragraphs',)


def get_paragraphs(val: str) -> List[str]:
    """Returns paragraphs as plain text, stripping HTML if needed.

    The value is first interpreted as HTML via :func:`get_paragraphs_html`.
    If that raises ``etree.XMLSyntaxError`` (the value is not well-formed
    markup) or ``ValueError`` (e.g. no non-empty ``<p>`` was found),
    the value is treated as plain text and split
    by :func:`get_paragraphs_plain` instead.

    :param val: abstract text, either HTML paragraphs or plain text
    :returns: list of paragraph strings (may be empty)
    """
    try:
        return get_paragraphs_html(val)
    except (etree.XMLSyntaxError, ValueError):
        # Not usable as HTML: fall back to blank-line-separated paragraphs.
        return get_paragraphs_plain(val)
def get_paragraphs_html(val: str) -> List[str]:
    """Extracts the text of top-level ``<p>`` elements from an HTML fragment.

    The fragment is wrapped in a synthetic ``<main>`` root so that several
    sibling paragraphs can be parsed as one document.

    :param val: HTML fragment expected to consist of ``<p>`` elements
    :returns: text content of each non-empty paragraph, in document order
    :raises ValueError: if no paragraph with non-empty text is found
    :raises etree.XMLSyntaxError: if the fragment is not well-formed
        (propagated from ``etree.fromstring``)
    """
    # NOTE(review): this uses the default parser settings; if ``val`` can come
    # from untrusted sources, consider a parser that does not resolve
    # entities -- confirm against callers.
    tree = etree.fromstring(f'<main>{val}</main>')

    # ``p.text`` alone stops at the first child element, which truncates
    # paragraphs containing inline markup (``<p>a <b>b</b> c</p>`` -> ``a ``)
    # and drops those that start with it. Joining ``itertext()`` keeps
    # the whole text content and strips the tags.
    texts = (''.join(p.itertext()) for p in tree.findall('p'))
    paragraphs = [text for text in texts if text != '']

    if not paragraphs:
        raise ValueError("No HTML text detected")
    return paragraphs


def get_paragraphs_plain(val: str) -> List[str]:
    """Splits plain text into paragraphs on blank lines.

    :param val: plain text with paragraphs separated by ``\\n\\n``
    :returns: paragraphs with surrounding whitespace stripped;
        chunks that are empty after stripping are omitted
    """
    # Strip each chunk once, then keep the non-empty ones.
    stripped = (chunk.strip() for chunk in val.split('\n\n'))
    return [chunk for chunk in stripped if chunk != '']