add parser and archive
This commit is contained in:
parent
386461fdf9
commit
8b919284f3
3 changed files with 600345 additions and 0 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -162,3 +162,5 @@ cython_debug/
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
# output files
|
||||||
|
output/
|
600302
page-and-filedump.xml
Normal file
600302
page-and-filedump.xml
Normal file
File diff suppressed because it is too large
Load diff
41
parser.py
Normal file
41
parser.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import os
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import base64
|
||||||
|
|
||||||
|
# Path to the XML dump file
dump_file = './page-and-filedump.xml'

# Directory to save the individual files
output_dir = 'output/'

# Clark-notation namespace prefix of the MediaWiki export schema (v0.11)
# used by every element in the dump.
MW_NS = '{http://www.mediawiki.org/xml/export-0.11/}'


def _safe_name(name):
    """Return *name* with path separators replaced by underscores.

    Page titles may contain '/' (sub-pages) or even '../'; replacing the
    separators keeps every written file directly inside the output directory.
    """
    return name.replace('/', '_').replace('\\', '_')


def export_dump(dump_path, out_dir):
    """Extract every page of a MediaWiki XML dump into *out_dir*.

    For each ``<page>`` element:

    * the full title is printed;
    * if the title contains a colon, the part before the first colon is
      treated as the namespace and stripped from the output name;
    * pages in the ``File`` namespace have their base64 encoded
      ``<upload>/<contents>`` decoded and written as a binary file;
    * every other page is written as ``<name>.html`` containing an
      ``<h1>`` heading followed by the text of the revision.

    Args:
        dump_path: Path of the XML dump to read.
        out_dir: Directory that receives the files; created if missing.

    Raises:
        xml.etree.ElementTree.ParseError: If the dump is not well-formed XML.
        OSError: If the dump cannot be read or a file cannot be written.
    """
    # The original script assumed the directory already existed.
    os.makedirs(out_dir, exist_ok=True)

    root = ET.parse(source=dump_path).getroot()

    for page in root.findall(f'./{MW_NS}page'):
        title = page.findtext(f'./{MW_NS}title')
        if not title:
            # Nothing to name the output after.
            continue

        print(title)

        # findtext() yields None when <revision>/<text> is absent and ''
        # when it is empty; normalise so the string "None" is never written.
        text = page.findtext(f'./{MW_NS}revision/{MW_NS}text') or ''

        namespace = ''
        if ':' in title:
            # Split on the FIRST colon only, so titles such as
            # "Help:Intro: Part 1" keep the remainder intact.
            namespace, _, title = title.partition(':')

        if namespace == 'File':
            content = page.findtext(f'./{MW_NS}upload/{MW_NS}contents')
            if content is None:
                # The dump carries only the description page, no upload.
                print(f'  no embedded upload for {title!r}, skipped')
                continue
            with open(os.path.join(out_dir, _safe_name(title)), 'wb') as f:
                f.write(base64.b64decode(content))
            continue

        # Create the output file path
        output_file = os.path.join(out_dir, f'{_safe_name(title)}.html')

        # Save the page content as an HTML file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f'<h1>{title}</h1>\n{text}')


if __name__ == '__main__':
    export_dump(dump_file, output_dir)
|
Loading…
Reference in a new issue