# Wiki-Archive/parser.py

import os
import xml.etree.ElementTree as ET
import base64

# MediaWiki export dumps wrap every element in this XML namespace,
# so each find() below needs the {uri} prefix.
NS = '{http://www.mediawiki.org/xml/export-0.11/}'

# Path to the XML dump file
dump_file = './page-and-filedump.xml'
# Directory to save the individual files
output_dir = 'output/'
os.makedirs(output_dir, exist_ok=True)

# Parse the XML dump
tree = ET.parse(dump_file)
root = tree.getroot()
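
# Judging from the find() paths below, the dump is assumed to look
# roughly like this (namespace URI abbreviated):
#   <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/">
#     <page>
#       <title>Main_Page</title>
#       <revision><text>...wikitext...</text></revision>
#     </page>
#     <page>
#       <title>File:Logo.png</title>
#       <upload><contents>...base64 data...</contents></upload>
#     </page>
#   </mediawiki>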
# Iterate through the pages and save them
for page in root.findall(f'./{NS}page'):
    title = page.find(f'./{NS}title').text
    text = page.find(f'./{NS}revision/{NS}text').text
    print(title)
    # Namespaced titles look like "File:Logo.png"; keep only the name part,
    # splitting once so titles containing further colons stay intact.
    if ':' in title:
        prefix, title = title.split(':', 1)
        if prefix == 'File':
            # File pages carry the upload as base64 in <upload><contents>;
            # decode it and write the raw bytes under the original filename.
            content = page.find(f'./{NS}upload/{NS}contents').text
            img = base64.b64decode(content)
            with open(os.path.join(output_dir, title), 'wb') as f:
                f.write(img)
            continue
    # Create the output file path
    output_file = os.path.join(output_dir, f'{title}.html')
    # Save the page content (raw wikitext) in a minimal HTML shell;
    # empty revisions parse as None, so fall back to an empty string.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f'<h1>{title}</h1>\n{text or ""}')
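
# A minimal smoke test, assuming a dump containing one page "Main_Page" and
# one upload "File:Logo.png": running `python parser.py` should print both
# titles and leave output/Main_Page.html plus output/Logo.png behind.
# Caveat: titles containing "/" (wiki subpages) would need sanitizing before
# being used as filenames; this script assumes the dump has none.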