add parser and archive

2024-08-13 12:04:48 -06:00 · 2024-08-13 12:04:48 -06:00 · 8b919284f3
commit 8b919284f3
parent 386461fdf9
3 changed files with 600345 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 # output files
 output/
--- a/page-and-filedump.xml
+++ b/page-and-filedump.xml
--- a/parser.py
+++ b/parser.py
@@ -0,0 +1,41 @@
 # Split a MediaWiki XML dump into one file per page: uploaded files are
 # decoded from base64 and written as-is, every other page becomes an HTML file.
 import os
 import xml.etree.ElementTree as ET
 import base64
 # Path to the XML dump file
 dump_file = './page-and-filedump.xml'
 # Directory to save the individual files
 output_dir = 'output/'
 # Prefix map for the export schema namespace used by the elements looked up below
 MW_NS = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
 # Parse the XML dump
 tree = ET.parse(source=dump_file)
 root = tree.getroot()
 # open() does not create missing directories, so make sure the target exists
 os.makedirs(output_dir, exist_ok=True)
 # Iterate through the pages and save them
 for page in root.findall('./mw:page', MW_NS):
    title = page.find('./mw:title', MW_NS).text
    # An empty <text/> element yields None; write an empty body, not "None"
    text = page.find('./mw:revision/mw:text', MW_NS).text or ''
    print(title)
    # Split on the first colon only so the remainder of the title stays intact
    # (e.g. "File:Foo:bar.png" -> "Foo:bar.png" rather than "Foo")
    namespace, colon, name = title.partition(':')
    if colon:
        title = name
        if namespace == 'File':
            # The upload's contents element holds the file as base64 text
            content = page.find('./mw:upload/mw:contents', MW_NS).text
            img = base64.b64decode(content)
            with open(os.path.join(output_dir, title), 'wb') as f:
                f.write(img)
            continue
    # Create the output file path
    output_file = os.path.join(output_dir, f'{title}.html')
    # A title containing "/" points into a subdirectory; create it first
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # Save the page content as an HTML file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f'<h1>{title}</h1>\n{text}')