import os
import xml.etree.ElementTree as ET
import base64

# Every element in a MediaWiki export is qualified with this namespace.
NS = '{http://www.mediawiki.org/xml/export-0.11/}'

# Path to the XML dump file (default for script use)
dump_file = './page-and-filedump.xml'
# Directory to save the individual files (default for script use)
output_dir = 'output/'


def _save_upload(page, path):
    """Decode a File page's base64 <upload><contents> payload and write it to *path*."""
    content = page.find(f'./{NS}upload/{NS}contents').text
    # was: open/write/close with no try/finally — leaked the handle on error
    with open(path, 'wb') as f:
        f.write(base64.b64decode(content))


def _save_page(title, text, path):
    """Write a wiki page's text to *path* as a minimal HTML document."""
    # NOTE(review): the original HTML template string was garbled in the
    # source (unterminated f-string with its tags stripped); reconstructed
    # here as a minimal valid document — title in <title>, text in <body>.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(
            f'<!DOCTYPE html>\n<html>\n<head><title>{title}</title></head>\n'
            f'<body>\n{text}\n</body>\n</html>\n'
        )


def main(dump=dump_file, out_dir=output_dir):
    """Split a MediaWiki XML export into one output file per page.

    Regular pages are written as ``<title>.html``.  Pages in the ``File:``
    namespace instead have their base64 upload contents decoded and written
    under the bare file name.  Other namespaces ("Talk:Foo") are stripped
    of their prefix, matching the original script's behaviour.

    Parameters:
        dump:    path to the XML dump file.
        out_dir: directory to create (if needed) and write results into.
    """
    root = ET.parse(source=dump).getroot()
    # was: exists-check + os.mkdir — racy, and failed if parents were missing
    os.makedirs(out_dir, exist_ok=True)

    for page in root.findall(f'./{NS}page'):
        title = page.find(f'./{NS}title').text
        text = page.find(f'./{NS}revision/{NS}text').text
        print(title)

        if ':' in title:
            # Split on the FIRST colon only; the original split(':')[1]
            # silently truncated titles containing further colons.
            namespace, title = title.split(':', 1)
            if namespace == 'File':
                _save_upload(page, os.path.join(out_dir, title))
                continue

        _save_page(title, text, os.path.join(out_dir, f'{title}.html'))


if __name__ == '__main__':
    main()