dweiss commented on code in PR #15195: URL: https://github.com/apache/lucene/pull/15195#discussion_r2354658742
########## build-tools/build-infra/src/main/java/org/apache/lucene/gradle/plugins/licenses/CheckLicensesTask.java: ########## @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.gradle.plugins.licenses; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.gradle.api.DefaultTask; +import org.gradle.api.GradleException; +import org.gradle.api.file.ConfigurableFileCollection; +import org.gradle.api.file.RegularFileProperty; +import org.gradle.api.provider.Property; +import org.gradle.api.tasks.CacheableTask; +import org.gradle.api.tasks.Input; +import org.gradle.api.tasks.InputFiles; +import org.gradle.api.tasks.OutputFile; +import org.gradle.api.tasks.PathSensitive; +import org.gradle.api.tasks.PathSensitivity; +import org.gradle.api.tasks.TaskAction; +import org.gradle.work.FileChange; +import org.gradle.work.Incremental; +import org.gradle.work.InputChanges; + +/** + * This task takes a set of files as input and verifies if their header contains any of the known + * license patterns. Files that don't have any permitted licenses will trigger an error. + */ +@CacheableTask +public abstract class CheckLicensesTask extends DefaultTask { + /** The default number of leading characters scanned in each file. */ + private static final Integer DEFAULT_SCANNED_HEADER = 1024; + + private record LicenseFamily(String code, String name, Predicate<String> matcherPredicate) {} + + static final List<LicenseFamily> LUCENE_ACCEPTED_LICENSES = Review Comment: This is the list of allowed licenses and their fixed-string detection patterns. ########## build-tools/build-infra/src/main/java/org/apache/lucene/gradle/plugins/licenses/CheckLicensesTask.java: ########## @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.gradle.plugins.licenses; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.gradle.api.DefaultTask; +import org.gradle.api.GradleException; +import org.gradle.api.file.ConfigurableFileCollection; +import org.gradle.api.file.RegularFileProperty; +import org.gradle.api.provider.Property; +import org.gradle.api.tasks.CacheableTask; +import org.gradle.api.tasks.Input; +import org.gradle.api.tasks.InputFiles; +import org.gradle.api.tasks.OutputFile; +import org.gradle.api.tasks.PathSensitive; +import org.gradle.api.tasks.PathSensitivity; +import org.gradle.api.tasks.TaskAction; +import org.gradle.work.FileChange; +import org.gradle.work.Incremental; +import org.gradle.work.InputChanges; + +/** + * This task takes a set of files as input and verifies if their header contains any of the known + * license patterns. Files that don't have any permitted licenses will trigger an error. + */ +@CacheableTask +public abstract class CheckLicensesTask extends DefaultTask { + /** The default number of leading characters scanned in each file. */ + private static final Integer DEFAULT_SCANNED_HEADER = 1024; + + private record LicenseFamily(String code, String name, Predicate<String> matcherPredicate) {} + + static final List<LicenseFamily> LUCENE_ACCEPTED_LICENSES = + List.of( + new LicenseFamily( + "ASL", + "Apache Software License 2.0", + fixedSubstrings( + "http://www.apache.org/licenses/LICENSE-2.0", + "https://www.apache.org/licenses/LICENSE-2.0")), + new LicenseFamily( + "MIT", + "The MIT License", + // ICU license (ScriptIterator.java) + fixedSubstring( + "Permission is hereby granted, free of charge, to any person obtaining a copy") + .and( + fixedSubstring( + "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR"))), + new LicenseFamily( + "BSD", + "Modified BSD License", + fixedSubstrings( + // brics automaton + "Copyright (c) 2001-2009 Anders Moeller", + // snowball + "Copyright (c) 2001, Dr Martin Porter", + // UMASS kstem + "THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS", + // Egothor + "Egothor Software License version 1.00", + // JaSpell + "Copyright (c) 2005 Bruno Martins", + // lz4 + "Copyright (c) 2011-2016, Yann Collet")), + new LicenseFamily( + "GENERATED", + "Generated files", + fixedSubstrings( + // snowball + "Generated by Snowball", + // javacc + "Generated By:JavaCC"))); + + /** The number of leading characters scanned in each file. */ + @Input + public abstract Property<Integer> getMaxScannedHeaderSize(); + + @InputFiles + @Incremental + @PathSensitive(PathSensitivity.RELATIVE) + public abstract ConfigurableFileCollection getFiles(); + + /** + * An output report file to keep up-to-date checks sane. The output file is a sorted list of all + * input files and their license codes. + */ + @OutputFile + public abstract RegularFileProperty getReportFile(); + + public CheckLicensesTask() { + getMaxScannedHeaderSize().convention(DEFAULT_SCANNED_HEADER); + } + + @TaskAction + public void run(InputChanges changes) throws IOException { + getLogger() + .info("Checking licenses {}", changes.isIncremental() ? "(incremental run)" : "(full run)"); + + // load the current report (if any) into a sorted map + File reportFile = getReportFile().getAsFile().get(); + TreeMap<String, String> report = readExistingReport(reportFile); + + // update the report with changes. + List<File> missingLicense = new ArrayList<>(); + char[] scratch = new char[1024]; + StringBuilder buffer = new StringBuilder(); + int count = 0; + for (FileChange fc : changes.getFileChanges(getFiles())) { + count++; + File file = fc.getFile(); + if (file.isDirectory()) { + continue; + } + + String key = toRootRelative(file); + + switch (fc.getChangeType()) { + case REMOVED -> report.remove(key); + case ADDED, MODIFIED -> { + LicenseFamily licenseFamily = detectLicense(file, buffer, scratch); + if (licenseFamily == null) { + missingLicense.add(file); + } else { + report.put(key, licenseFamily.code); + } + } + default -> throw new IOException("Unexpected change type: " + fc.getChangeType()); + } + } + + getLogger().info("Checked {} {}", count, count == 1 ? "file" : "files"); + + if (!missingLicense.isEmpty()) { + throw new GradleException( + String.format( + Locale.ROOT, + "The following files have no (acceptable) license header or the license header is not within the first %s header characters:\n%s", + getMaxScannedHeaderSize().get(), + missingLicense.stream() + .map(file -> " - " + toRootRelative(file)) + .collect(Collectors.joining("\n")))); + } + + // Also, in case the input set of files has itself changed, ensure the report only + // contains files that still exist under the current sources. + Set<String> current = new HashSet<>(); + for (File f : getFiles().getFiles()) { + if (f.isFile()) { + current.add(toRootRelative(f)); + } + } + report.keySet().removeIf(k -> !current.contains(k)); + + var logger = getLogger(); + if (logger.isInfoEnabled()) { + var counts = + report.entrySet().stream() + .collect(Collectors.groupingBy(Map.Entry::getValue, Collectors.counting())); + logger.info( + "License type counts:\n{}", + counts.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue().longValue(), a.getValue().longValue())) + .map( + e -> + String.format( + Locale.ROOT, + " - %s: %,d %s", + e.getKey(), + e.getValue(), + e.getValue() == 1 ? "file" : "files")) + .collect(Collectors.joining("\n"))); + } + + writeReport(report, reportFile); + } + + private TreeMap<String, String> readExistingReport(File reportFile) throws IOException { + TreeMap<String, String> report = new TreeMap<>(); + if (!reportFile.exists()) { + return report; + } + + try (var lineStream = Files.lines(reportFile.toPath())) { + lineStream.forEach( + line -> { + int idx = line.lastIndexOf(':'); + String path = line.substring(0, idx).trim(); + String state = line.substring(idx + 1).trim(); + if (!path.isEmpty() && !state.isEmpty()) { + report.put(path, state); + } + }); + } + + return report; + } + + private void writeReport(TreeMap<String, String> report, File reportFile) throws IOException { + Files.createDirectories(reportFile.toPath().getParent()); + + try (BufferedWriter bw = Files.newBufferedWriter(reportFile.toPath())) { + for (Map.Entry<String, String> e : report.entrySet()) { + bw.write(e.getKey()); + bw.write(": "); + bw.write(e.getValue()); + bw.write("\n"); + } + } + } + + /** Convert the given path to a root-project-relative path with unix path separators. */ + private String toRootRelative(File file) { + Path root = getProject().getRootDir().toPath().normalize(); + Path p = file.toPath().toAbsolutePath().normalize(); + String key = root.relativize(p).toString(); + if (File.separatorChar != '/') { + key = key.replace(File.separatorChar, '/'); + } + return key; + } + + private LicenseFamily detectLicense(File file, StringBuilder buffer, char[] scratch) + throws IOException { + // I assume all files are in UTF8... This is verified elsewhere (eclint). Review Comment: I assume all files are in UTF8... This is verified elsewhere (eclint) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
