Coverage for src/xml.py: 67%

21 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2024-04-27 07:17 -0500

1import xml.etree.ElementTree 

2 

# Registered once at import time so serialized output uses an unprefixed
# (default) namespace for the Atom URI and the "media" prefix for the
# Yahoo MRSS URI, instead of auto-generated ns0/ns1 prefixes.
xml.etree.ElementTree.register_namespace(
    prefix="", uri="http://www.w3.org/2005/Atom")
xml.etree.ElementTree.register_namespace(
    prefix="media", uri="http://search.yahoo.com/mrss/")

6 

7 

class ParseError(Exception):
    """Raised when the XML input cannot be parsed.

    Derives from `Exception` (not `BaseException`) so that ordinary
    `except Exception` handlers treat it like any other application error.
    """

10 

11 

def prettify(content: str) -> str:
    """Prettify an XML string.

    If the first line is an XML declaration or a doctype it is set aside
    before parsing and put back, unchanged, in front of the result.

    Raises a `ParseError` if the input is invalid (empty input included).

    >>> print(prettify('<some><xml></xml></some>'))
    <some>
      <xml />
    </some>
    """
    # if the first line is a declaration/doctype, set it aside; only the
    # comparison is case-insensitive, the line itself is kept verbatim
    lines = content.splitlines(keepends=True)
    prolog = None
    if lines and lines[0].lower().startswith(('<!doctype', '<?xml')):
        prolog = lines[0].rstrip('\r\n')
        content = ''.join(lines[1:])

    # escape special characters
    # NOTE: this escapes every ampersand, including ones that already start
    # an entity reference, so '&amp;' in the input is emitted as '&amp;amp;'
    content = content.replace('&', '&amp;')

    # convert to a tree
    try:
        tree = xml.etree.ElementTree.fromstring(content)
    except xml.etree.ElementTree.ParseError as e:
        raise ParseError(
            f'error parsing the following ({e.args})\n{content[:800]}') from e

    # add indenting
    xml.etree.ElementTree.indent(tree)

    # serialize straight to `str`: no XML declaration is emitted, so there
    # is no header line to strip afterwards
    pretty = xml.etree.ElementTree.tostring(tree, encoding='unicode')

    # reattach the original first line, if there was one
    if prolog:
        return prolog + '\n' + pretty
    return pretty