Skip to content

Commit de17607

Browse files
authored
Update generate_sitemap.py
1 parent 880c102 commit de17607

File tree

1 file changed

+28
-26
lines changed

1 file changed

+28
-26
lines changed

generate_sitemap.py

Lines changed: 28 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import requests
22
import xml.etree.ElementTree as ET
3+
from xml.dom import minidom
34
from tqdm import tqdm
45
from urllib.parse import quote
56

@@ -34,12 +35,14 @@ def fetch_sitemap(url):
3435
response.raise_for_status()
3536
return response.text
3637

37-
def serialize_xml(element):
38-
"""
39-
Serialize the XML Element to a bytes object with XML declaration,
40-
without any additional formatting (all content in one line).
41-
"""
42-
return ET.tostring(element, encoding='utf-8', xml_declaration=True)
38+
def prettify_xml(element):
    """Render an ElementTree element as indented XML text.

    The element is serialized to UTF-8 bytes, re-parsed with minidom and
    pretty-printed. The result is returned as a ``str`` that starts with
    an XML declaration naming the encoding.
    """
    dom = minidom.parseString(ET.tostring(element, encoding='utf-8'))
    # Passing an encoding makes minidom put it in the XML declaration;
    # the call then returns bytes rather than str.
    pretty_bytes = dom.toprettyxml(indent=" ", encoding="UTF-8")
    # Callers write the result to a text-mode file, so hand back a str.
    return pretty_bytes.decode('UTF-8')
4346

4447
def encode_url(url):
4548
"""Encode the URL to make it XML-safe and RFC-compliant."""
@@ -54,7 +57,14 @@ def add_static_urls_without_translations(root, urls):
5457
root.append(url_element)
5558

5659
def add_translated_urls(url_element, original_url):
57-
"""Add translated URLs with language codes appended to the path."""
60+
"""Add translated URLs with language codes appended to the path, including x-default."""
61+
# Add x-default hreflang pointing to the original URL
62+
alt_link_default = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
63+
alt_link_default.set('rel', 'alternate')
64+
alt_link_default.set('hreflang', 'x-default')
65+
alt_link_default.set('href', encode_url(original_url))
66+
67+
# Add hreflang links for each language
5868
for hreflang, lang_code in languages.items():
5969
# Add the language code to the path
6070
path_parts = original_url.split('/', 3)
@@ -75,11 +85,13 @@ def main():
7585
cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
7686

7787
# Fetch both sitemaps
88+
print("Fetching sitemaps...")
7889
book_sitemap_data = fetch_sitemap(book_sitemap_url)
7990
cloud_sitemap_data = fetch_sitemap(cloud_sitemap_url)
8091

8192
# Parse XML
8293
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
94+
print("Parsing sitemaps...")
8395
book_root = ET.fromstring(book_sitemap_data)
8496
cloud_root = ET.fromstring(cloud_sitemap_data)
8597

@@ -106,9 +118,11 @@ def main():
106118
"https://training.hacktricks.xyz/terms",
107119
"https://training.hacktricks.xyz/privacy",
108120
]
121+
print("Adding static URLs without translations...")
109122
add_static_urls_without_translations(new_root, static_training_urls)
110123

111124
# Process main URLs from book and cloud hacktricks sitemaps
125+
print("Processing main URLs with translations...")
112126
for url_element in tqdm(all_urls, desc="Processing URLs"):
113127
loc = url_element.find('ns:loc', ns)
114128
if loc is None:
@@ -135,30 +149,18 @@ def main():
135149
lastmod_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
136150
lastmod_el.text = lastmod.text
137151

138-
# Add translations
152+
# Add translations and x-default
139153
add_translated_urls(url_entry, loc_text)
140154

141155
new_root.append(url_entry)
142156

143-
# Serialize XML to bytes with XML declaration, no pretty formatting
144-
serialized_xml = serialize_xml(new_root)
145-
146-
# Convert bytes to string and replace single quotes with double quotes in XML declaration
147-
serialized_xml_str = serialized_xml.decode('utf-8')
148-
if serialized_xml_str.startswith("<?xml"):
149-
# Replace single quotes with double quotes in the XML declaration only
150-
xml_declaration_end = serialized_xml_str.find("?>") + 2
151-
xml_declaration = serialized_xml_str[:xml_declaration_end]
152-
xml_declaration = xml_declaration.replace("'", '"')
153-
rest_of_xml = serialized_xml_str[xml_declaration_end:]
154-
serialized_xml_str = xml_declaration + rest_of_xml
155-
156-
# Remove any newline or carriage return characters to ensure single-line XML
157-
serialized_xml_str = serialized_xml_str.replace('\n', '').replace('\r', '')
158-
159-
# Write the serialized XML to file as text
157+
# Save prettified XML to file
158+
print("Generating prettified XML sitemap...")
159+
beautified_xml = prettify_xml(new_root)
160160
with open("sitemap.xml", "w", encoding="utf-8") as f:
161-
f.write(serialized_xml_str)
161+
f.write(beautified_xml)
162+
163+
print("sitemap.xml has been successfully generated.")
162164

163165
if __name__ == "__main__":
164166
main()

0 commit comments

Comments
 (0)