This is an automated email from the ASF dual-hosted git repository. kassiez pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris-website.git
# The following commit(s) were added to refs/heads/master by this push:
#     new b983f9afe2d update check tools (#2064)
# b983f9afe2d is described below
#
# commit b983f9afe2d7624ebd13dd4e37c6e2bda988e793
# Author: wangtianyi2004 <376612...@qq.com>
# AuthorDate: Tue Feb 18 11:10:48 2025 +0800
#
#     update check tools (#2064)
#
#     ## Versions
#     - [ ] dev
#     - [ ] 3.0
#     - [ ] 2.1
#     - [ ] 2.0
#
#     ## Languages
#     - [ ] Chinese
#     - [ ] English
#
#     ## Docs Checklist
#     - [ ] Checked by AI
#     - [ ] Test Cases Built
# ---
#  modify-deadlink.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++
#  1 file changed, 140 insertions(+)
#
# diff --git a/modify-deadlink.py b/modify-deadlink.py
# new file mode 100644
# index 00000000000..a2d48f98c05
# --- /dev/null
# +++ b/modify-deadlink.py
# @@ -0,0 +1,140 @@
"""Suggest fixes for dead Markdown links reported in a link-check log.

Each log line is expected to contain ``link '<url>'`` and
``in file '<path>:<line>'``.  For every such line the script looks for a
Markdown file with the linked name under a search directory and, when exactly
one candidate exists, prints a ``sed`` command that rewrites the link.
"""

import os
import re
import shlex
import sys
from collections import defaultdict, namedtuple

# Define a structure to store information, added the 'sed_str' field
FileInfo = namedtuple(
    'FileInfo',
    ['target_file', 'url_line', 'url_path', 'url_count', 'relative_url',
     'log_error', 'origin_url', 'sed_str'],
)

# Patterns for the two quoted fields of a log line.
_SOURCE_RE = re.compile(r"in file '([^']+)'")
_LINK_RE = re.compile(r"link '([^']+)'")


def _build_file_index(search_dir):
    """Map every file name under *search_dir* to the paths where it occurs."""
    index = defaultdict(list)
    for root, _dirs, files in os.walk(search_dir):
        for name in files:
            index[name].append(os.path.join(root, name))
    return index


def _report(message, line_content, prefix=""):
    """Print a diagnostic, the offending log line and a separator line."""
    print(f"{prefix}{message}")
    print(f"{prefix}Error log: {line_content}")  # Output the current error log
    print("-" * 80)  # Print the separator line


def _relative_link(found_file, base_file_path, anchor=""):
    """Return the link to *found_file* as written from *base_file_path*.

    The result is relative to the directory of the linking file, starts with
    ``./`` or ``../``, has no ``.md`` suffix, and ends with *anchor* (the
    ``#fragment`` of the original link, possibly empty).
    """
    relative_url = os.path.relpath(found_file, os.path.dirname(base_file_path))
    if not relative_url.startswith("../"):
        relative_url = "./" + relative_url
    if relative_url.endswith(".md"):
        relative_url = relative_url[:-3]
    return relative_url + anchor


def find_file(file_str, search_dir, line_content, file_index=None):
    """Resolve one dead-link log line to a candidate replacement.

    Args:
        file_str: Log line holding ``link '<url>'`` and
            ``in file '<path>:<line>'``.
        search_dir: Directory tree searched for the linked Markdown file.
        line_content: Log line stored in the result and echoed in
            diagnostics; ``origin_url`` is extracted from it.
        file_index: Optional mapping of file name to list of paths, as built
            by ``_build_file_index``.  When omitted, *search_dir* is walked
            for this call.

    Returns:
        A list with one ``FileInfo`` when at least one candidate file was
        found, otherwise an empty list (a diagnostic is printed instead).
        ``relative_url`` and ``sed_str`` are filled in only when exactly one
        candidate exists; they stay empty for ambiguous matches.
    """
    source_match = _SOURCE_RE.search(file_str)
    if not source_match:
        _report("No valid base file path found in the input string.", line_content)
        return []

    # e.g. "versioned_docs/version-3.0/.../data-type-overview.md:67"
    parts = source_match.group(1).split(":")
    base_file_path = parts[0]  # File path without the line number
    line_number = parts[1] if len(parts) > 1 else ""  # Part after the colon

    link_match = _LINK_RE.search(file_str)
    if not link_match:
        _report("No valid file path found in the input string.", line_content)
        return []

    # Split off a "#anchor" so it neither pollutes the file name lookup nor
    # gets lost from the rewritten link.
    link_path, hash_sep, fragment = link_match.group(1).partition("#")
    file_base_name = os.path.basename(link_path)
    if file_base_name.endswith(".md"):
        target_filename = file_base_name
    else:
        target_filename = f"{file_base_name}.md"

    if file_index is None:
        file_index = _build_file_index(search_dir)
    found_files = file_index.get(target_filename, [])
    if not found_files:
        _report(
            f"No file named {target_filename} found in {search_dir}.",
            line_content,
            prefix="[ERR] ",
        )
        return []

    url_count = len(found_files)
    # Path of the last candidate, relative to the current working directory.
    url_path = os.path.relpath(found_files[-1], os.getcwd())

    origin_match = _LINK_RE.search(line_content)
    origin_url = origin_match.group(1) if origin_match else ""

    relative_url = ""
    sed_str = ""
    if url_count == 1:
        # Only an unambiguous match yields a usable replacement.
        relative_url = _relative_link(
            found_files[0], base_file_path, hash_sep + fragment
        )
        # NOTE: origin_url is used as a sed pattern without regex escaping.
        sed_str = (
            f"sed -i '{line_number}s|({origin_url})|({relative_url})|' "
            f"{shlex.quote(base_file_path)}"
        )

    return [
        FileInfo(
            target_file=base_file_path,
            url_line=line_number,
            url_path=url_path,
            url_count=url_count,
            relative_url=relative_url,
            log_error=line_content,  # Store the current line content
            origin_url=origin_url,  # Store origin_url
            sed_str=sed_str,  # Store sed command
        )
    ]


def get_deadlink(file_path, search_dir):
    """Run ``find_file`` on every non-blank line of the log at *file_path*.

    Args:
        file_path: Path of the dead-link log file.
        search_dir: Directory tree searched for the linked Markdown files.

    Returns:
        The ``FileInfo`` results of all lines, in log order.  An empty list is
        returned (and a message printed) when *file_path* is not a file.
    """
    if not os.path.isfile(file_path):  # Check if it's a valid file
        print(f"{file_path} is not a valid file.")
        return []

    # Walk the tree once instead of once per log line.
    file_index = _build_file_index(search_dir)

    results = []
    with open(file_path, 'r', encoding='utf-8', errors='replace') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()  # Remove surrounding spaces and newline
            if not line:
                continue  # Blank lines carry no link information
            results.extend(find_file(line, search_dir, line, file_index))
    return results


def print_results(results):
    """Print every field of each ``FileInfo`` in *results*."""
    for result in results:
        print(f"[LOG] target_file >> {result.target_file}")
        print(f"[LOG] url_line >> {result.url_line}")
        print(f"[LOG] url_path >> {result.url_path}")
        print(f"[LOG] url_count >> {result.url_count}")
        print(f"[LOG] relative_url >> {result.relative_url}")
        print(f"[LOG] log_error >> {result.log_error}")  # Print log_error
        print(f"[LOG] origin_url >> {result.origin_url}")  # Print origin_url
        print(f"[LOG] sed_str >> {result.sed_str}")  # Print sed_str
        print("----------------------------------------------------------------")


if __name__ == "__main__":
    # Get input arguments
    if len(sys.argv) != 3:
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: python {script_name} '<file_with_logs>' <search_dir>")
        sys.exit(1)

    file_with_logs = sys.argv[1]  # Get the file path
    search_dir = sys.argv[2]  # Get the search directory

    # Process the file and get results
    results = get_deadlink(file_with_logs, search_dir)

    # Print the results from the structure array
    print_results(results)

# ---------------------------------------------------------------------
# To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
# For additional commands, e-mail: commits-h...@doris.apache.org