Repository: commons-text Updated Branches: refs/heads/myers-algo [created] a7e88eef1
Migrating Myers algorithm from [collections] Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/cca1f199 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/cca1f199 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/cca1f199 Branch: refs/heads/myers-algo Commit: cca1f19918760d4e538ff4d70d875878418f6f1a Parents: 37c7804 Author: Bruno P. Kinoshita <ki...@apache.org> Authored: Sun Dec 14 23:41:43 2014 -0200 Committer: Bruno P. Kinoshita <ki...@apache.org> Committed: Sun Dec 14 23:41:43 2014 -0200 ---------------------------------------------------------------------- .../commons/text/diff/CommandVisitor.java | 143 ++++++++++ .../apache/commons/text/diff/DeleteCommand.java | 55 ++++ .../apache/commons/text/diff/EditCommand.java | 82 ++++++ .../apache/commons/text/diff/EditScript.java | 133 +++++++++ .../apache/commons/text/diff/InsertCommand.java | 57 ++++ .../apache/commons/text/diff/KeepCommand.java | 57 ++++ .../commons/text/diff/ReplacementsFinder.java | 109 +++++++ .../commons/text/diff/ReplacementsHandler.java | 52 ++++ .../commons/text/diff/StringsComparator.java | 281 +++++++++++++++++++ .../apache/commons/text/diff/package-info.java | 22 ++ 10 files changed, 991 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/CommandVisitor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/CommandVisitor.java b/src/main/java/org/apache/commons/text/diff/CommandVisitor.java new file mode 100644 index 0000000..ae3833c --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/CommandVisitor.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +/** + * This interface should be implemented by user object to walk + * through {@link EditScript EditScript} objects. + * <p> + * Users should implement this interface in order to walk through + * the {@link EditScript EditScript} object created by the comparison + * of two sequences. This is a direct application of the visitor + * design pattern. The {@link EditScript#visit EditScript.visit} + * method takes an object implementing this interface as an argument, + * it will perform the loop over all commands in the script and the + * proper methods of the user class will be called as the commands are + * encountered. + * <p> + * The implementation of the user visitor class will depend on the + * need. Here are two examples. + * <p> + * The first example is a visitor that build the longest common + * subsequence: + * <pre> + * import org.apache.commons.collections4.comparators.sequence.CommandVisitor; + * + * import java.util.ArrayList; + * + * public class LongestCommonSubSequence implements CommandVisitor { + * + * public LongestCommonSubSequence() { + * a = new ArrayList(); + * } + * + * public void visitInsertCommand(Object object) { + * } + * + * public void visitKeepCommand(Object object) { + * a.add(object); + * } + * + * public void visitDeleteCommand(Object object) { + * } + * + * public Object[] getSubSequence() { + * return a.toArray(); + * } + * + * private ArrayList a; + * + * } + * </pre> + * <p> + * The second example is a visitor that shows the commands and the way + * they transform the first sequence into the second one: + * <pre> + * import org.apache.commons.collections4.comparators.sequence.CommandVisitor; + * + * import java.util.Arrays; + * import java.util.ArrayList; + * import java.util.Iterator; + * + * public class ShowVisitor implements CommandVisitor { + * + * public ShowVisitor(Object[] sequence1) { + * v = new ArrayList(); + * v.addAll(Arrays.asList(sequence1)); + * index = 0; + * } + * + * public void visitInsertCommand(Object object) { + * v.insertElementAt(object, index++); + * display("insert", object); + * } + * + * public void visitKeepCommand(Object object) { + * ++index; + * display("keep ", object); + * } + * + * public void visitDeleteCommand(Object object) { + * v.remove(index); + * display("delete", object); + * } + * + * private void display(String commandName, Object object) { + * System.out.println(commandName + " " + object + " ->" + this); + * } + * + * public String toString() { + * StringBuffer buffer = new StringBuffer(); + * for (Iterator iter = v.iterator(); iter.hasNext();) { + * buffer.append(' ').append(iter.next()); + * } + * return buffer.toString(); + * } + * + * private ArrayList v; + * private int index; + * + * } + * </pre> + * + * @since 4.0 + * @version $Id: CommandVisitor.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public interface CommandVisitor<T> { + + /** + * Method called when an insert command is encountered. + * + * @param object object to insert (this object comes from the second sequence) + */ + void visitInsertCommand(T object); + + /** + * Method called when a keep command is encountered. + * + * @param object object to keep (this object comes from the first sequence) + */ + void visitKeepCommand(T object); + + /** + * Method called when a delete command is encountered. + * + * @param object object to delete (this object comes from the first sequence) + */ + void visitDeleteCommand(T object); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/DeleteCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/DeleteCommand.java b/src/main/java/org/apache/commons/text/diff/DeleteCommand.java new file mode 100644 index 0000000..0de3a43 --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/DeleteCommand.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +/** + * Command representing the deletion of one object of the first sequence. + * <p> + * When one object of the first sequence has no corresponding object in the + * second sequence at the right place, the {@link EditScript edit script} + * transforming the first sequence into the second sequence uses an instance of + * this class to represent the deletion of this object. The objects embedded in + * these type of commands always come from the first sequence. + * + * @see SequencesComparator + * @see EditScript + * + * @since 4.0 + * @version $Id: DeleteCommand.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public class DeleteCommand<T> extends EditCommand<T> { + + /** + * Simple constructor. Creates a new instance of {@link DeleteCommand}. + * + * @param object the object of the first sequence that should be deleted + */ + public DeleteCommand(final T object) { + super(object); + } + + /** + * Accept a visitor. When a <code>DeleteCommand</code> accepts a visitor, it calls + * its {@link CommandVisitor#visitDeleteCommand visitDeleteCommand} method. + * + * @param visitor the visitor to be accepted + */ + @Override + public void accept(final CommandVisitor<T> visitor) { + visitor.visitDeleteCommand(getObject()); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/EditCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/EditCommand.java b/src/main/java/org/apache/commons/text/diff/EditCommand.java new file mode 100644 index 0000000..7c5ff76 --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/EditCommand.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +/** + * Abstract base class for all commands used to transform an objects sequence + * into another one. + * <p> + * When two objects sequences are compared through the + * {@link SequencesComparator#getScript SequencesComparator.getScript} method, + * the result is provided has a {@link EditScript script} containing the commands + * that progressively transform the first sequence into the second one. + * <p> + * There are only three types of commands, all of which are subclasses of this + * abstract class. Each command is associated with one object belonging to at + * least one of the sequences. These commands are {@link InsertCommand + * InsertCommand} which correspond to an object of the second sequence being + * inserted into the first sequence, {@link DeleteCommand DeleteCommand} which + * correspond to an object of the first sequence being removed and + * {@link KeepCommand KeepCommand} which correspond to an object of the first + * sequence which <code>equals</code> an object in the second sequence. It is + * guaranteed that comparison is always performed this way (i.e. the + * <code>equals</code> method of the object from the first sequence is used and + * the object passed as an argument comes from the second sequence) ; this can + * be important if subclassing is used for some elements in the first sequence + * and the <code>equals</code> method is specialized. + * + * @see SequencesComparator + * @see EditScript + * + * @since 4.0 + * @version $Id: EditCommand.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public abstract class EditCommand<T> { + + /** Object on which the command should be applied. */ + private final T object; + + /** + * Simple constructor. Creates a new instance of EditCommand + * + * @param object reference to the object associated with this command, this + * refers to an element of one of the sequences being compared + */ + protected EditCommand(final T object) { + this.object = object; + } + + /** + * Returns the object associated with this command. + * + * @return the object on which the command is applied + */ + protected T getObject() { + return object; + } + + /** + * Accept a visitor. + * <p> + * This method is invoked for each commands belonging to + * an {@link EditScript EditScript}, in order to implement the visitor design pattern + * + * @param visitor the visitor to be accepted + */ + public abstract void accept(CommandVisitor<T> visitor); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/EditScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/EditScript.java b/src/main/java/org/apache/commons/text/diff/EditScript.java new file mode 100644 index 0000000..ab7b04e --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/EditScript.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +import java.util.ArrayList; +import java.util.List; + +/** + * This class gathers all the {@link EditCommand commands} needed to transform + * one objects sequence into another objects sequence. + * <p> + * An edit script is the most general view of the differences between two + * sequences. It is built as the result of the comparison between two sequences + * by the {@link SequencesComparator SequencesComparator} class. The user can + * walk through it using the <em>visitor</em> design pattern. + * <p> + * It is guaranteed that the objects embedded in the {@link InsertCommand insert + * commands} come from the second sequence and that the objects embedded in + * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep + * commands} come from the first sequence. This can be important if subclassing + * is used for some elements in the first sequence and the <code>equals</code> + * method is specialized. + * + * @see SequencesComparator + * @see EditCommand + * @see CommandVisitor + * @see ReplacementsHandler + * + * @since 4.0 + * @version $Id: EditScript.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public class EditScript<T> { + + /** Container for the commands. */ + private final List<EditCommand<T>> commands; + + /** Length of the longest common subsequence. */ + private int lcsLength; + + /** Number of modifications. */ + private int modifications; + + /** + * Simple constructor. Creates a new empty script. + */ + public EditScript() { + commands = new ArrayList<EditCommand<T>>(); + lcsLength = 0; + modifications = 0; + } + + /** + * Add a keep command to the script. + * + * @param command command to add + */ + public void append(final KeepCommand<T> command) { + commands.add(command); + ++lcsLength; + } + + /** + * Add an insert command to the script. + * + * @param command command to add + */ + public void append(final InsertCommand<T> command) { + commands.add(command); + ++modifications; + } + + /** + * Add a delete command to the script. + * + * @param command command to add + */ + public void append(final DeleteCommand<T> command) { + commands.add(command); + ++modifications; + } + + /** + * Visit the script. The script implements the <em>visitor</em> design + * pattern, this method is the entry point to which the user supplies its + * own visitor, the script will be responsible to drive it through the + * commands in order and call the appropriate method as each command is + * encountered. + * + * @param visitor the visitor that will visit all commands in turn + */ + public void visit(final CommandVisitor<T> visitor) { + for (final EditCommand<T> command : commands) { + command.accept(visitor); + } + } + + /** + * Get the length of the Longest Common Subsequence (LCS). The length of the + * longest common subsequence is the number of {@link KeepCommand keep + * commands} in the script. + * + * @return length of the Longest Common Subsequence + */ + public int getLCSLength() { + return lcsLength; + } + + /** + * Get the number of effective modifications. The number of effective + * modification is the number of {@link DeleteCommand delete} and + * {@link InsertCommand insert} commands in the script. + * + * @return number of effective modifications + */ + public int getModifications() { + return modifications; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/InsertCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/InsertCommand.java b/src/main/java/org/apache/commons/text/diff/InsertCommand.java new file mode 100644 index 0000000..9ec9c6a --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/InsertCommand.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +/** + * Command representing the insertion of one object of the second sequence. + * <p> + * When one object of the second sequence has no corresponding object in the + * first sequence at the right place, the {@link EditScript edit script} + * transforming the first sequence into the second sequence uses an instance of + * this class to represent the insertion of this object. The objects embedded in + * these type of commands always come from the second sequence. + * + * @see SequencesComparator + * @see EditScript + * + * @since 4.0 + * @version $Id: InsertCommand.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public class InsertCommand<T> extends EditCommand<T> { + + /** + * Simple constructor. Creates a new instance of InsertCommand + * + * @param object the object of the second sequence that should be inserted + */ + public InsertCommand(final T object) { + super(object); + } + + /** + * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor, + * it calls its {@link CommandVisitor#visitInsertCommand visitInsertCommand} + * method. + * + * @param visitor the visitor to be accepted + */ + @Override + public void accept(final CommandVisitor<T> visitor) { + visitor.visitInsertCommand(getObject()); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/KeepCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/KeepCommand.java b/src/main/java/org/apache/commons/text/diff/KeepCommand.java new file mode 100644 index 0000000..a511bfe --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/KeepCommand.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +/** + * Command representing the keeping of one object present in both sequences. + * <p> + * When one object of the first sequence <code>equals</code> another objects in + * the second sequence at the right place, the {@link EditScript edit script} + * transforming the first sequence into the second sequence uses an instance of + * this class to represent the keeping of this object. The objects embedded in + * these type of commands always come from the first sequence. + * + * @see SequencesComparator + * @see EditScript + * + * @since 4.0 + * @version $Id: KeepCommand.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public class KeepCommand<T> extends EditCommand<T> { + + /** + * Simple constructor. Creates a new instance of KeepCommand + * + * @param object the object belonging to both sequences (the object is a + * reference to the instance in the first sequence which is known + * to be equal to an instance in the second sequence) + */ + public KeepCommand(final T object) { + super(object); + } + + /** + * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it + * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method. + * + * @param visitor the visitor to be accepted + */ + @Override + public void accept(final CommandVisitor<T> visitor) { + visitor.visitKeepCommand(getObject()); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java new file mode 100644 index 0000000..361a26d --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +import java.util.ArrayList; +import java.util.List; + +/** + * This class handles sequences of replacements resulting from a comparison. + * <p> + * The comparison of two objects sequences leads to the identification of common + * parts and parts which only belong to the first or to the second sequence. The + * common parts appear in the edit script in the form of <em>keep</em> commands, + * they can be considered as synchronization objects between the two sequences. + * These synchronization objects split the two sequences in synchronized + * sub-sequences. The first sequence can be transformed into the second one by + * replacing each synchronized sub-sequence of the first sequence by the + * corresponding sub-sequence of the second sequence. This is a synthetic way to + * see an {@link EditScript edit script}, replacing individual + * {@link DeleteCommand delete}, {@link KeepCommand keep} and + * {@link InsertCommand insert} commands by fewer replacements acting on + * complete sub-sequences. + * <p> + * This class is devoted to perform this interpretation. It visits an + * {@link EditScript edit script} (because it implements the + * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied + * handler implementing the {@link ReplacementsHandler ReplacementsHandler} + * interface to process the sub-sequences. + * + * @see ReplacementsHandler + * @see EditScript + * @see SequencesComparator + * + * @since 4.0 + * @version $Id: ReplacementsFinder.java 1477760 2013-04-30 18:34:03Z tn $ + */ +public class ReplacementsFinder<T> implements CommandVisitor<T> { + + private final List<T> pendingInsertions; + private final List<T> pendingDeletions; + private int skipped; + + /** Handler to call when synchronized sequences are found. */ + private final ReplacementsHandler<T> handler; + + /** + * Simple constructor. Creates a new instance of {@link ReplacementsFinder}. + * + * @param handler handler to call when synchronized sequences are found + */ + public ReplacementsFinder(final ReplacementsHandler<T> handler) { + pendingInsertions = new ArrayList<T>(); + pendingDeletions = new ArrayList<T>(); + skipped = 0; + this.handler = handler; + } + + /** + * Add an object to the pending insertions set. + * + * @param object object to insert + */ + public void visitInsertCommand(final T object) { + pendingInsertions.add(object); + } + + /** + * Handle a synchronization object. + * <p> + * When a synchronization object is identified, the pending insertions and + * pending deletions sets are provided to the user handler as subsequences. + * + * @param object synchronization object detected + */ + public void visitKeepCommand(final T object) { + if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) { + ++skipped; + } else { + handler.handleReplacement(skipped, pendingDeletions, pendingInsertions); + pendingDeletions.clear(); + pendingInsertions.clear(); + skipped = 1; + } + } + + /** + * Add an object to the pending deletions set. + * + * @param object object to delete + */ + public void visitDeleteCommand(final T object) { + pendingDeletions.add(object); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java new file mode 100644 index 0000000..fe99a03 --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.diff; + +import java.util.List; + +/** + * This interface is devoted to handle synchronized replacement sequences. + * + * @see ReplacementsFinder + * @since 4.0 + * @version $Id: ReplacementsHandler.java 1543277 2013-11-19 00:53:50Z ggregory $ + */ +public interface ReplacementsHandler<T> { + + /** + * Handle two synchronized sequences. + * <p> + * This method is called by a {@link ReplacementsFinder ReplacementsFinder} + * instance when it has synchronized two sub-sequences of object arrays + * being compared, and at least one of the sequences is non-empty. Since the + * sequences are synchronized, the objects before the two sub-sequences are + * equals (if they exist). This property also holds for the objects after + * the two sub-sequences. + * <p> + * The replacement is defined as replacing the <code>from</code> + * sub-sequence into the <code>to</code> sub-sequence. + * + * @param skipped number of tokens skipped since the last call (i.e. number of + * tokens that were in both sequences), this number should be strictly positive + * except on the very first call where it can be zero (if the first object of + * the two sequences are different) + * @param from sub-sequence of objects coming from the first sequence + * @param to sub-sequence of objects coming from the second sequence + */ + void handleReplacement(int skipped, List<T> from, List<T> to); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/StringsComparator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/StringsComparator.java b/src/main/java/org/apache/commons/text/diff/StringsComparator.java new file mode 100644 index 0000000..c59f89f --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/StringsComparator.java @@ -0,0 +1,281 @@ +package org.apache.commons.text.diff; + + + +public class StringsComparator { + + private final String left; + private final String right; + + /** Temporary variables. */ + private final int[] vDown; + private final int[] vUp; + + public StringsComparator(String left, String right) { + this.left = left; + this.right = right; + + final int size = left.length() + right.length() + 2; + vDown = new int[size]; + vUp = new int[size]; + } + + /** + * Get the {@link EditScript} object. + * <p> + * It is guaranteed that the objects embedded in the {@link InsertCommand + * insert commands} come from the second sequence and that the objects + * embedded in either the {@link DeleteCommand delete commands} or + * {@link KeepCommand keep commands} come from the first sequence. This can + * be important if subclassing is used for some elements in the first + * sequence and the <code>equals</code> method is specialized. + * + * @return the edit script resulting from the comparison of the two + * sequences + */ + public EditScript getScript() { + final EditScript script = new EditScript(); + buildScript(0, left.length(), 0, right.length(), script); + return script; + } + + /** + * Build an edit script. + * + * @param start1 the begin of the first sequence to be compared + * @param end1 the end of the first sequence to be compared + * @param start2 the begin of the second sequence to be compared + * @param end2 the end of the second sequence to be compared + * @param script the edited script + */ + private void buildScript(final int start1, final int end1, final int start2, final int end2, + final EditScript script) { + final Snake middle = getMiddleSnake(start1, end1, start2, end2); + + if (middle == null + || middle.getStart() == end1 && middle.getDiag() == end1 - end2 + || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) { + + int i = start1; + int j = start2; + while (i < end1 || j < end2) { + if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) { + script.append(new KeepCommand(left.charAt(i))); + ++i; + ++j; + } else { + if (end1 - start1 > end2 - start2) { + script.append(new DeleteCommand(left.charAt(i))); + ++i; + } else { + script.append(new InsertCommand(right.charAt(j))); + ++j; + } + } + } + + } else { + + buildScript(start1, middle.getStart(), + start2, middle.getStart() - middle.getDiag(), + script); + for (int i = middle.getStart(); i < middle.getEnd(); ++i) { + script.append(new KeepCommand(left.charAt(i))); + } + buildScript(middle.getEnd(), end1, + middle.getEnd() - middle.getDiag(), end2, + script); + } + } + + private Snake getMiddleSnake(int start1, int end1, int start2, int end2) { + // Myers Algorithm + // Initialisations + final int m = end1 - start1; + final int n = end2 - start2; + if (m == 0 || n == 0) { + return null; + } + + final int delta = m - n; + final int sum = n + m; + final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2; + vDown[1+offset] = start1; + vUp[1+offset] = end1 + 1; + + for (int d = 0; d <= offset ; ++d) { + // Down + for (int k = -d; k <= d; k += 2) { + // First step + + final int i = k + offset; + if (k == -d || k != d && vDown[i-1] < vDown[i+1]) { + vDown[i] = vDown[i+1]; + } else { + vDown[i] = vDown[i-1] + 1; + } + + int x = vDown[i]; + int y = x - start1 + start2 - k; + + while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) { + vDown[i] = ++x; + ++y; + } + // Second step + if (delta % 2 != 0 && delta - d <= k && k <= delta + d) { + if (vUp[i-delta] <= vDown[i]) { + return buildSnake(vUp[i-delta], k + start1 - start2, end1, end2); + } + } + } + + // Up + for (int k = delta - d; k <= delta + d; k += 2) { + // First step + final int i = k + offset - delta; + if (k == delta - d + || k != delta + d && vUp[i+1] <= vUp[i-1]) { + vUp[i] = vUp[i+1] - 1; + } else { + vUp[i] = vUp[i-1]; + } + + int x = vUp[i] - 1; + int y = x - start1 + start2 - k; + while (x >= start1 && y >= start2 + && left.charAt(x) == right.charAt(y)) { + vUp[i] = x--; + y--; + } + // Second step + if (delta % 2 == 0 && -d <= k && k <= d ) { + if (vUp[i] <= vDown[i + delta]) { + return buildSnake(vUp[i], k + start1 - start2, end1, end2); + } + } + } + } + + // this should not happen + throw new RuntimeException("Internal Error"); + } + + /** + * Build a snake. + * + * @param start the value of the start of the snake + * @param diag the value of the diagonal of the snake + * @param end1 the value of the end of the first sequence to be compared + * @param end2 the value of the end of the second sequence to be compared + * @return the snake built + */ + private Snake buildSnake(final int start, final int diag, final int end1, final int end2) { + int end = start; + while (end - diag < end2 + && end < end1 + && left.charAt(end) == right.charAt(end - diag)) { + ++end; + } + return new Snake(start, end, diag); + } + + /** + * This class is a simple placeholder to hold the end part of a path + * under construction in a {@link StringsComparator StringsComparator}. + */ + private static class Snake { + + /** Start index. */ + private final int start; + + /** End index. */ + private final int end; + + /** Diagonal number. */ + private final int diag; + + /** + * Simple constructor. Creates a new instance of Snake with specified indices. + * + * @param start start index of the snake + * @param end end index of the snake + * @param diag diagonal number + */ + public Snake(final int start, final int end, final int diag) { + this.start = start; + this.end = end; + this.diag = diag; + } + + /** + * Get the start index of the snake. + * + * @return start index of the snake + */ + public int getStart() { + return start; + } + + /** + * Get the end index of the snake. + * + * @return end index of the snake + */ + public int getEnd() { + return end; + } + + /** + * Get the diagonal number of the snake. + * + * @return diagonal number of the snake + */ + public int getDiag() { + return diag; + } + } + + public static void main(String[] args) { + StringsComparator sc = new StringsComparator("O Bruno eh um bom rapaz. Ele eh do Brasil.", "O Bruno foi um bom rapaz. Ele eh do Brasil ."); + EditScript es = sc.getScript(); + es.visit(new CommandVisitor() { + + boolean nlAdd = true; + boolean nlRemove = true; + + @Override + public void visitInsertCommand(Object object) { + if (nlAdd) { + System.out.println(); + System.out.print("> "); + nlAdd = false; + } + System.out.print(object); + } + + @Override + public void visitKeepCommand(Object object) { + if (!nlAdd) { + nlAdd = true; + } + if (!nlRemove) { + nlRemove = true; + System.out.println(); + } + System.out.print(object); + } + + @Override + public void visitDeleteCommand(Object object) { + if (nlRemove) { + System.out.println(); + System.out.print("< "); + nlRemove = false; + } + System.out.print(object); + } + }); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/cca1f199/src/main/java/org/apache/commons/text/diff/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/package-info.java b/src/main/java/org/apache/commons/text/diff/package-info.java new file mode 100644 index 0000000..7a565d4 --- /dev/null +++ b/src/main/java/org/apache/commons/text/diff/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * <p>Provides algorithms for diff between strings.</p> + * + * @since 1.0 + */ +package org.apache.commons.text.diff;