1
1
import requests
2
2
import xml .etree .ElementTree as ET
3
+ from xml .dom import minidom
3
4
from tqdm import tqdm
4
5
from urllib .parse import quote
5
6
@@ -34,12 +35,14 @@ def fetch_sitemap(url):
34
35
response .raise_for_status ()
35
36
return response .text
36
37
37
- def serialize_xml (element ):
38
- """
39
- Serialize the XML Element to a bytes object with XML declaration,
40
- without any additional formatting (all content in one line).
41
- """
42
- return ET .tostring (element , encoding = 'utf-8' , xml_declaration = True )
38
def _strip_layout_whitespace(node):
    """Remove whitespace-only text nodes that sit beside element siblings.

    Such nodes are indentation left over from previously formatted XML.
    ``toprettyxml`` writes them back out verbatim in addition to its own
    indentation, which yields blank / whitespace-only lines in the result.
    Text that is the sole content of an element is left untouched.
    """
    has_element_child = any(
        child.nodeType == child.ELEMENT_NODE for child in node.childNodes
    )
    # Iterate over a copy: removeChild mutates node.childNodes in place.
    for child in list(node.childNodes):
        if child.nodeType == child.TEXT_NODE:
            if has_element_child and not child.data.strip():
                node.removeChild(child)
        else:
            _strip_layout_whitespace(child)


def prettify_xml(element: ET.Element) -> str:
    """Return *element* as indented XML text with an XML declaration.

    The element is serialized with ElementTree, re-parsed with minidom and
    pretty-printed. The declaration names the encoding
    (``<?xml version="1.0" encoding="UTF-8"?>``) and uses double quotes.

    Args:
        element: Root element of the tree to serialize.

    Returns:
        The pretty-printed document as a ``str``, ready to be written to a
        file opened in text mode with UTF-8 encoding.
    """
    rough_bytes = ET.tostring(element, encoding='utf-8')
    document = minidom.parseString(rough_bytes)

    # Drop pre-existing indentation so already-formatted input does not
    # produce empty lines between elements.
    _strip_layout_whitespace(document)

    # Passing an encoding makes minidom include it in the XML declaration;
    # the price is that toprettyxml then returns bytes instead of str.
    pretty_bytes = document.toprettyxml(indent=" ", encoding="UTF-8")

    # Decode back to text for callers that write with a text-mode handle.
    return pretty_bytes.decode('UTF-8')
43
46
44
47
def encode_url (url ):
45
48
"""Encode the URL to make it XML-safe and RFC-compliant."""
@@ -54,7 +57,14 @@ def add_static_urls_without_translations(root, urls):
54
57
root .append (url_element )
55
58
56
59
def add_translated_urls (url_element , original_url ):
57
- """Add translated URLs with language codes appended to the path."""
60
+ """Add translated URLs with language codes appended to the path, including x-default."""
61
+ # Add x-default hreflang pointing to the original URL
62
+ alt_link_default = ET .SubElement (url_element , '{http://www.w3.org/1999/xhtml}link' )
63
+ alt_link_default .set ('rel' , 'alternate' )
64
+ alt_link_default .set ('hreflang' , 'x-default' )
65
+ alt_link_default .set ('href' , encode_url (original_url ))
66
+
67
+ # Add hreflang links for each language
58
68
for hreflang , lang_code in languages .items ():
59
69
# Add the language code to the path
60
70
path_parts = original_url .split ('/' , 3 )
@@ -75,11 +85,13 @@ def main():
75
85
cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
76
86
77
87
# Fetch both sitemaps
88
+ print ("Fetching sitemaps..." )
78
89
book_sitemap_data = fetch_sitemap (book_sitemap_url )
79
90
cloud_sitemap_data = fetch_sitemap (cloud_sitemap_url )
80
91
81
92
# Parse XML
82
93
ns = {'ns' : 'http://www.sitemaps.org/schemas/sitemap/0.9' }
94
+ print ("Parsing sitemaps..." )
83
95
book_root = ET .fromstring (book_sitemap_data )
84
96
cloud_root = ET .fromstring (cloud_sitemap_data )
85
97
@@ -106,9 +118,11 @@ def main():
106
118
"https://training.hacktricks.xyz/terms" ,
107
119
"https://training.hacktricks.xyz/privacy" ,
108
120
]
121
+ print ("Adding static URLs without translations..." )
109
122
add_static_urls_without_translations (new_root , static_training_urls )
110
123
111
124
# Process main URLs from book and cloud hacktricks sitemaps
125
+ print ("Processing main URLs with translations..." )
112
126
for url_element in tqdm (all_urls , desc = "Processing URLs" ):
113
127
loc = url_element .find ('ns:loc' , ns )
114
128
if loc is None :
@@ -135,30 +149,18 @@ def main():
135
149
lastmod_el = ET .SubElement (url_entry , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
136
150
lastmod_el .text = lastmod .text
137
151
138
- # Add translations
152
+ # Add translations and x-default
139
153
add_translated_urls (url_entry , loc_text )
140
154
141
155
new_root .append (url_entry )
142
156
143
- # Serialize XML to bytes with XML declaration, no pretty formatting
144
- serialized_xml = serialize_xml (new_root )
145
-
146
- # Convert bytes to string and replace single quotes with double quotes in XML declaration
147
- serialized_xml_str = serialized_xml .decode ('utf-8' )
148
- if serialized_xml_str .startswith ("<?xml" ):
149
- # Replace single quotes with double quotes in the XML declaration only
150
- xml_declaration_end = serialized_xml_str .find ("?>" ) + 2
151
- xml_declaration = serialized_xml_str [:xml_declaration_end ]
152
- xml_declaration = xml_declaration .replace ("'" , '"' )
153
- rest_of_xml = serialized_xml_str [xml_declaration_end :]
154
- serialized_xml_str = xml_declaration + rest_of_xml
155
-
156
- # Remove any newline or carriage return characters to ensure single-line XML
157
- serialized_xml_str = serialized_xml_str .replace ('\n ' , '' ).replace ('\r ' , '' )
158
-
159
- # Write the serialized XML to file as text
157
+ # Save prettified XML to file
158
+ print ("Generating prettified XML sitemap..." )
159
+ beautified_xml = prettify_xml (new_root )
160
160
with open ("sitemap.xml" , "w" , encoding = "utf-8" ) as f :
161
- f .write (serialized_xml_str )
161
+ f .write (beautified_xml )
162
+
163
+ print ("sitemap.xml has been successfully generated." )
162
164
163
165
if __name__ == "__main__" :
164
166
main ()
0 commit comments