This is an automated email from the ASF dual-hosted git repository.

kassiez pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris-website.git


The following commit(s) were added to refs/heads/master by this push:
     new b983f9afe2d update check tools (#2064)
b983f9afe2d is described below

commit b983f9afe2d7624ebd13dd4e37c6e2bda988e793
Author: wangtianyi2004 <376612...@qq.com>
AuthorDate: Tue Feb 18 11:10:48 2025 +0800

    update check tools (#2064)
    
    ## Versions
    
    - [ ] dev
    - [ ] 3.0
    - [ ] 2.1
    - [ ] 2.0
    
    ## Languages
    
    - [ ] Chinese
    - [ ] English
    
    ## Docs Checklist
    
    - [ ] Checked by AI
    - [ ] Test Cases Built
---
 modify-deadlink.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/modify-deadlink.py b/modify-deadlink.py
new file mode 100644
index 00000000000..a2d48f98c05
--- /dev/null
+++ b/modify-deadlink.py
@@ -0,0 +1,140 @@
import os
import sys
import re
from collections import namedtuple

# One record per resolved dead link, including a ready-made sed command
# ('sed_str') that would rewrite the link in place.
FileInfo = namedtuple(
    'FileInfo',
    'target_file url_line url_path url_count relative_url '
    'log_error origin_url sed_str',
)
+
def find_file(file_str, search_dir, line_content):
    """Locate the target of one dead link reported by the checker log.

    Parses ``file_str`` for two quoted fields:
      * ``in file '<path>:<line>'`` -- the document containing the dead link,
      * ``link '<url>'``            -- the broken link itself.
    Then walks ``search_dir`` for files named after the link's basename
    (normalized to a ``.md`` suffix).  When exactly one candidate exists,
    a corrected relative URL and a ready-to-run sed command are built.

    Args:
        file_str: one line of the dead-link checker log.
        search_dir: directory tree to search for the link target.
        line_content: raw log line kept verbatim for reporting (callers
            currently pass the same string as ``file_str``).

    Returns:
        List with at most one FileInfo entry; empty when parsing fails or
        no candidate file is found (an error is printed in those cases).
    """
    results = []

    # The document that contains the dead link, e.g.
    # "versioned_docs/version-3.0/.../data-type-overview.md:67".
    base_match = re.search(r"in file '([^']+)'", file_str)
    if not base_match:
        print("No valid base file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return results

    # NOTE(review): split(":") would also split a Windows drive letter;
    # acceptable for the repo-relative paths this tool consumes.
    parts = base_match.group(1).split(":")
    base_file_path = parts[0]
    line_number = parts[1] if len(parts) > 1 else ""

    # The broken link itself, quoted after the word 'link'.
    link_match = re.search(r"link '([^']+)'", file_str)
    if not link_match:
        print("No valid file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return results

    # Candidate file name: the link's basename, normalized to end in .md.
    file_base_name = os.path.basename(link_match.group(1))
    if file_base_name.endswith(".md"):
        target_filename = file_base_name
    else:
        target_filename = f"{file_base_name}.md"

    # Every file under search_dir whose name matches the candidate.
    found_files = [
        os.path.join(root, target_filename)
        for root, dirs, files in os.walk(search_dir)
        if target_filename in files
    ]

    if not found_files:
        print(f"[ERR] No file named {target_filename} found in {search_dir}.")
        print(f"[ERR] Error log: {line_content}")
        print("-" * 80)
        return results

    url_count = len(found_files)
    # url_path records the last match; it is only meaningful (unambiguous)
    # when url_count == 1.
    url_path = os.path.relpath(found_files[-1], os.getcwd())

    # Replacement URL, relative to the document that holds the dead link;
    # only produced for an unambiguous (single) match.
    relative_url = ""
    if url_count == 1:
        relative_url = os.path.relpath(
            found_files[0], os.path.dirname(base_file_path))
        if not relative_url.startswith("../"):
            relative_url = "./" + relative_url
        if relative_url.endswith(".md"):
            relative_url = relative_url[:-3]

    # The original URL exactly as it appears in the log line.
    origin_url_match = re.search(r"link '([^']+)'", line_content)
    origin_url = origin_url_match.group(1) if origin_url_match else ""

    # Ready-to-run fix command (single-match case only).
    # NOTE(review): origin_url/relative_url are not escaped, so sed
    # metacharacters ('.', '*', '|', ...) inside a URL may misbehave.
    sed_str = ""
    if url_count == 1:
        sed_str = (
            f"sed -i '{line_number}s|({origin_url})|({relative_url})|'"
            f" {base_file_path}"
        )

    results.append(FileInfo(
        target_file=base_file_path,
        url_line=line_number,
        url_path=url_path,
        url_count=url_count,
        relative_url=relative_url,
        log_error=line_content,   # raw log line, kept for reporting
        origin_url=origin_url,
        sed_str=sed_str,
    ))
    return results
+
# Read the checker log file and resolve every line through find_file.
def get_deadlink(file_path, search_dir):
    """Read a dead-link log and collect FileInfo results for each line.

    Args:
        file_path: text file with one checker log line per line.
        search_dir: directory tree handed through to find_file().

    Returns:
        Flat list of FileInfo records accumulated over all log lines;
        empty when file_path is not a regular file (a message is printed).
    """
    results = []
    if not os.path.isfile(file_path):
        print(f"{file_path} is not a valid file.")
        return results

    # Explicit encoding: the implicit default varies with platform locale.
    with open(file_path, 'r', encoding='utf-8') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            # The stripped line serves both as the parse input and as the
            # verbatim log text stored in each FileInfo for reporting.
            results.extend(find_file(line, search_dir, line))

    return results
+
# Dump every FileInfo record as a block of "[LOG] field >> value" lines.
def print_results(results):
    """Print each field of every FileInfo record, separated by a rule."""
    for info in results:
        rows = [
            ("target_file", info.target_file),
            ("url_line", info.url_line),
            ("url_path", info.url_path),
            ("url_count", info.url_count),
            ("relative_url", info.relative_url),
            ("log_error", info.log_error),
            ("origin_url", info.origin_url),
            ("sed_str", info.sed_str),
        ]
        for field_name, field_value in rows:
            print(f"[LOG] {field_name} >> {field_value}")
        print("----------------------------------------------------------------")
+
if __name__ == "__main__":
    # Expect exactly two arguments: the log file and the search directory.
    if len(sys.argv) != 3:
        print("Usage: python find_file.py '<file_with_logs>' <search_dir>")
        sys.exit(1)

    file_with_logs, search_dir = sys.argv[1], sys.argv[2]

    # Resolve every logged dead link, then report all findings.
    print_results(get_deadlink(file_with_logs, search_dir))
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to