add parser and archive

This commit is contained in:
PossiblyAxolotl 2024-08-13 12:04:48 -06:00
parent 386461fdf9
commit 8b919284f3
3 changed files with 600345 additions and 0 deletions

2
.gitignore vendored
View file

@ -162,3 +162,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# output files
output/

600302
page-and-filedump.xml Normal file

File diff suppressed because it is too large Load diff

41
parser.py Normal file
View file

@ -0,0 +1,41 @@
import os
import xml.etree.ElementTree as ET
import base64
# Path to the XML dump file
dump_file = './page-and-filedump.xml'
# Directory to save the individual files
output_dir = 'output/'
# ElementTree "{uri}" prefix for the MediaWiki export schema used by the dump
_MW_NS = '{http://www.mediawiki.org/xml/export-0.11/}'
# Page titles become file names, so characters that would nest the file in a
# (non-existent) sub-directory or that are invalid on Windows are replaced.
_UNSAFE_CHARS = str.maketrans({'/': '_', '\\': '_', ':': '_'})


def export_pages(dump_path=dump_file, out_dir=output_dir):
    """Write every page and uploaded file of a MediaWiki XML dump to disk.

    Each ``<page>`` element is handled as follows:

    * Titles of the form ``File:<name>`` are treated as uploads: the base64
      payload in ``<upload><contents>`` is decoded and written to
      ``<out_dir>/<name>``. File pages without a payload are reported and
      skipped.
    * Every other page is written to ``<out_dir>/<name>.html`` as an
      ``<h1>`` heading followed by the raw text of its revision. If the
      title contains a colon, everything up to the first colon is dropped
      from ``<name>``.

    Args:
        dump_path: Path of the XML dump to read.
        out_dir: Directory that receives the output; created if missing.

    Returns:
        The paths of all files written, in dump order.

    Raises:
        xml.etree.ElementTree.ParseError: If the dump is not well-formed XML.
        OSError: If the dump cannot be read or an output file cannot be
            written.
    """
    # The output directory is git-ignored, so it does not exist on a fresh
    # checkout.
    os.makedirs(out_dir, exist_ok=True)

    root = ET.parse(dump_path).getroot()
    written = []

    # Iterate through the pages and save them
    for page in root.findall(f'./{_MW_NS}page'):
        title = page.findtext(f'./{_MW_NS}title')
        if not title:
            # Without a title there is nothing to name the output file after.
            print('skipping page without a title')
            continue
        print(title)

        # Split on the FIRST colon only, so "Help:Foo:Bar" keeps "Foo:Bar"
        # instead of being truncated to "Foo".
        prefix, colon, rest = title.partition(':')
        name = rest if colon else title
        file_name = name.translate(_UNSAFE_CHARS)

        if colon and prefix == 'File':
            encoded = page.findtext(f'./{_MW_NS}upload/{_MW_NS}contents')
            if encoded is None:
                print(f'no upload contents for {title}, skipping')
                continue
            target = os.path.join(out_dir, file_name)
            with open(target, 'wb') as f:
                f.write(base64.b64decode(encoded))
            written.append(target)
            continue

        # An empty <text/> element yields None; write an empty body rather
        # than the literal string "None".
        text = page.findtext(f'./{_MW_NS}revision/{_MW_NS}text') or ''

        # Save the page content as an HTML file
        target = os.path.join(out_dir, f'{file_name}.html')
        with open(target, 'w', encoding='utf-8') as f:
            f.write(f'<h1>{name}</h1>\n{text}')
        written.append(target)

    return written


if __name__ == '__main__':
    export_pages()