This is an automated email from the ASF dual-hosted git repository.

chia7712 pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/kafka.git


The following commit(s) were added to refs/heads/trunk by this push:
     new f47b9a4e00b KAFKA-20362 Resolve reviewer email via GitHub commit search API (#22135)
f47b9a4e00b is described below

commit f47b9a4e00b3f1dd88aafa59c27b5ad3d0388647
Author: Ming-Yen Chung <[email protected]>
AuthorDate: Wed Apr 29 01:04:51 2026 +0800

    KAFKA-20362 Resolve reviewer email via GitHub commit search API (#22135)
    
    The `resolve_reviewer` introduced in #22108 used local `git log` to find
    past `Reviewers:` trailers, but the PR linter runs with the default
    shallow checkout (`fetch-depth: 1`), so older merged PR trailers are not
    available on the runner.
    
    This PR switches reviewer email resolution to GitHub APIs in this order:
    
    1. the reviewer's latest apache/kafka commit author email
    2. `gh search commits` for prior `Reviewers:` trailers by display name,
       accepted only when the matched commit's associated PR has a review
       from the same GitHub login
    3. the reviewer's GitHub public profile email
    4. `Name (github:login)` fallback, without tagging the reviewer
    
    It also skips `pr-reviewed.yml` artifact creation once the PR is no
    longer open.
    
    For the motivating #22108 case, this resolves `@mimaison` as `Mickael
    Maison <[email protected]>` instead of falling back to the GitHub
    handle.
    
    `gh search commits` uses GitHub's Search API bucket. In CI it is
    authenticated via `GH_TOKEN`, so the limit is 30 requests/minute
    (unauthenticated is 10 requests/minute). This workflow normally resolves
    one reviewer per run. In local checks, `mjsax` and `mingyen066` resolved
    via tier 1 (latest commit author email) in 0.5-0.8s, while `mimaison`
    and `UladzislauBlok` resolved via tier 2's search-plus-review
    verification in about 3.0s.
    
    Reviewers: Chia-Ping Tsai <[email protected]>
---
 .github/scripts/pr-format.py      | 147 +++++++++++++++++++++-----------------
 .github/workflows/pr-reviewed.yml |   1 +
 2 files changed, 83 insertions(+), 65 deletions(-)

diff --git a/.github/scripts/pr-format.py b/.github/scripts/pr-format.py
index aab44e3e08d..e2ad6a292fd 100644
--- a/.github/scripts/pr-format.py
+++ b/.github/scripts/pr-format.py
@@ -107,86 +107,102 @@ def split_paragraphs(text: str):
 def resolve_reviewer(login: str) -> tuple:
     """Map a GitHub login to (name, email).
 
-    Tries three tiers in order: repo commit history, GitHub user profile,
-    and past `Reviewers:` trailers in git log (matched by name).
-    Noreply emails (@users.noreply.github.com) are treated as missing since
-    they are GitHub privacy placeholders that do not identify the reviewer.
-    Returns (name, None) when no usable email is found; the caller falls
-    back to the '(@login)' form in the Reviewers trailer.
+    Tries reviewer email sources in order: repo commit author email, past
+    `Reviewers:` trailers searched via GitHub commit search API (matched
+    by name and verified by PR review login), and GitHub user profile
+    public email. Noreply emails (@users.noreply.github.com) are treated
+    as missing since they are GitHub privacy placeholders that do not
+    identify the reviewer. Returns (name, None) when no usable email is
+    found; the caller falls back to the '(github:login)' form in the
+    Reviewers trailer.
     """
     def _usable_email(e):
         if not e or e.endswith("@users.noreply.github.com"):
             return None
         return e
 
-    name = None
-    email = None
-
-    # Tier 1: find from repo commit history. Misses when the reviewer has no
-    # merged commit in apache/kafka, or had "Keep my email private" enabled
-    # at commit time (GitHub rewrites the author to the noreply form).
-    try:
-        cmd = f"gh api repos/apache/kafka/commits?author={login}&per_page=1"
-        p = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
-        if p.returncode == 0:
-            commits = json.loads(p.stdout)
-            if commits:
-                author = commits[0].get("commit", {}).get("author", {})
-                name = author.get("name")
-                email = _usable_email(author.get("email"))
-    except Exception as e:
-        logger.debug(f"Failed to resolve {login} from commit history: {e}")
-
-    # Tier 2: GitHub user profile. Only exposes an email when the reviewer
-    # has set a Public email in their profile settings.
-    if not name or not email:
-        try:
-            cmd = f"gh api users/{login}"
-            p = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
-            if p.returncode == 0:
-                user = json.loads(p.stdout)
-                if not name:
-                    name = user.get("name")
-                if not email:
-                    email = _usable_email(user.get("email"))
-        except Exception as e:
-            logger.debug(f"Failed to resolve {login} from GitHub profile: {e}")
-
-    # Tier 3: past Reviewers: trailers in git log, matched by name. Catches
-    # pure reviewers (no commits in apache/kafka, no public profile email)
-    # who have been credited with a real email in an earlier merged PR.
-    # git log is newest-first, so the first usable match is the most recent.
-    if name and not email:
+    def _run_json(cmd, source):
         try:
-            p = subprocess.run(
-                ["git", "log",
-                 "--pretty=format:%(trailers:key=Reviewers,valueonly=true,unfold=true)"],
-                capture_output=True, text=True,
-            )
+            p = subprocess.run(cmd, capture_output=True, text=True)
             if p.returncode == 0:
-                pattern = re.compile(rf"{re.escape(name)}\s*<([^>]+)>")
-                for line in p.stdout.splitlines():
-                    for m in pattern.finditer(line):
-                        candidate = _usable_email(m.group(1))
-                        if candidate:
-                            email = candidate
-                            break
-                    if email:
-                        break
+                return json.loads(p.stdout)
+            logger.debug(f"Failed to resolve {login} from {source}: {p.stderr}")
         except Exception as e:
-            logger.debug(f"Failed to resolve {login} from past Reviewers trailers: {e}")
-
-    if not name:
-        name = login
+            logger.debug(f"Failed to resolve {login} from {source}: {e}")
+        return None
 
-    return (name, email)
+    def _has_pr_review_from_login(commit_sha):
+        pulls = _run_json(["gh", "api", f"repos/apache/kafka/commits/{commit_sha}/pulls"],
+                          f"associated PRs for commit {commit_sha}") or []
+        for pull in pulls:
+            pr_number = pull.get("number")
+            if not pr_number:
+                continue
+            reviews = _run_json(["gh", "api", f"repos/apache/kafka/pulls/{pr_number}/reviews?per_page=100"],
+                                f"reviews for PR {pr_number}") or []
+            if any((review.get("user") or {}).get("login", "").lower() == login.lower()
+                   for review in reviews):
+                return True
+        return False
+
+    commits = _run_json(["gh", "api", f"repos/apache/kafka/commits?author={login}&per_page=1"],
+                        "commit history") or []
+    author = commits[0].get("commit", {}).get("author", {}) if commits else {}
+
+    # Tier 1: latest repo commit authored by this GitHub login. Misses
+    # when the reviewer has no merged commit in apache/kafka, or had
+    # "Keep my email private" enabled at commit time (GitHub rewrites
+    # the author to the noreply form).
+    email = _usable_email(author.get("email"))
+    if email:
+        return (author.get("name") or login, email)
+
+    user = _run_json(["gh", "api", f"users/{login}"], "GitHub profile") or {}
+
+    name_candidates = []
+    for candidate in (user.get("name"), author.get("name"), login):
+        if candidate and candidate not in name_candidates:
+            name_candidates.append(candidate)
+
+    name = name_candidates[0] if name_candidates else login
+
+    # Tier 2: past Reviewers: trailers in commit history, matched by name,
+    # via the GitHub commit search API. Catches pure reviewers (no commits
+    # in apache/kafka, no public profile email) who have been credited
+    # with a real email in an earlier merged PR. Sort by committer-date
+    # desc so the most recent email wins if a reviewer has changed it.
+    # Full-text search is tokenized (not strict substring), so we re-verify
+    # with a regex client-side. To avoid same-name matches, we only accept
+    # a trailer email when the matched commit's associated PR includes a
+    # review from this GitHub login.
+    for candidate in name_candidates:
+        results = _run_json(["gh", "search", "commits",
+                             "--repo", "apache/kafka",
+                             f'"{candidate} <"',
+                             "--limit", "10",
+                             "--sort", "committer-date",
+                             "--order", "desc",
+                             "--json", "sha,commit"],
+                            "commit search") or []
+        pattern = re.compile(rf"{re.escape(candidate)}\s*<([^>]+)>")
+        for result in results:
+            msg = result.get("commit", {}).get("message", "")
+            commit_sha = result.get("sha")
+            for match in pattern.finditer(msg):
+                candidate_email = _usable_email(match.group(1))
+                if candidate_email and commit_sha and _has_pr_review_from_login(commit_sha):
+                    return (candidate, candidate_email)
+
+    # Tier 3: GitHub user profile. Only exposes an email when the reviewer
+    # has set a Public email in their profile settings.
+    return (name, _usable_email(user.get("email")))
 
 
 def already_exists(identity: str, existing_reviewers: List[str]) -> bool:
     """Check if a reviewer identity is already in the existing reviewers list.
 
     identity is the delimited token that uniquely identifies a reviewer, either
-    '<email>' (for the email form) or '(@login)' (for the login fallback).
+    '<email>' (for the email form) or '(github:login)' (for the login fallback).
     """
     return identity.lower() in ", ".join(existing_reviewers).lower()
 
@@ -246,7 +262,8 @@ if __name__ == "__main__":
         if email:
             identity = f"<{email}>"
         else:
-            identity = f"(@{reviewer_login})"
+            # Tier 4: fall back to the GitHub handle without tagging the reviewer.
+            identity = f"(github:{reviewer_login})"
         resolved = f"{name} {identity}"
         existing_reviewers = parse_trailers(title, body).get("Reviewers", [])
         if not already_exists(identity, existing_reviewers):
diff --git a/.github/workflows/pr-reviewed.yml b/.github/workflows/pr-reviewed.yml
index edacae2ff29..316c9c6b92c 100644
--- a/.github/workflows/pr-reviewed.yml
+++ b/.github/workflows/pr-reviewed.yml
@@ -29,6 +29,7 @@ jobs:
   save-pr-number:
     name: Save PR Number
     runs-on: ubuntu-latest
+    if: github.event.pull_request.state == 'open'
     steps:
       - name: Env
         run: printenv

Reply via email to