-
Notifications
You must be signed in to change notification settings - Fork 201
Add cluster command for PPL #5265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
ritvibhatt
wants to merge
16
commits into
opensearch-project:main
Choose a base branch
from
ritvibhatt:ppl-cluster-command
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
8f2804e
implement cluster command
ritvibhatt b3cc8ce
fix anonymizer
ritvibhatt 152d2d4
fix default values
ritvibhatt 74001d0
fix anonymizer
ritvibhatt 2edff79
fix integ tests
ritvibhatt 082a8e9
fix explain tests
ritvibhatt df7873a
fix doctest
ritvibhatt 5792e98
fix docs and null handling
ritvibhatt e009b93
address concurrency
ritvibhatt 2d99d92
fix cross cluster tests
ritvibhatt 1763fb8
fix integ tests
ritvibhatt 3fbd709
fix formatting
ritvibhatt 5bfaf69
fix tests
ritvibhatt 9563361
fix tests
ritvibhatt ca93d37
fix explain tests
ritvibhatt a88cc9a
fix explain tests
ritvibhatt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
169 changes: 169 additions & 0 deletions
169
common/src/main/java/org/opensearch/sql/common/cluster/TextSimilarityClustering.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,169 @@ | ||
| /* | ||
| * Copyright OpenSearch Contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| package org.opensearch.sql.common.cluster; | ||
|
|
||
| import java.util.HashMap; | ||
| import java.util.LinkedHashMap; | ||
| import java.util.Map; | ||
| import org.apache.commons.text.similarity.CosineSimilarity; | ||
|
|
||
| /** | ||
| * Greedy single-pass text similarity clustering for grouping similar text values. Events are | ||
| * processed in order; each is compared to existing cluster representatives using cosine similarity. | ||
| * If the best match meets the threshold, the event joins that cluster; otherwise a new cluster is | ||
| * created. | ||
| * | ||
| * <p>Optimized for incremental processing with vector caching and memory-efficient operations. | ||
| */ | ||
| public class TextSimilarityClustering { | ||
|
|
||
| private static final CosineSimilarity COSINE = new CosineSimilarity(); | ||
|
|
||
| // Cache vectorized representations to avoid recomputation | ||
| private final Map<String, Map<CharSequence, Integer>> vectorCache = | ||
| new LinkedHashMap<>(MAX_CACHE_SIZE, 0.75f, true) { | ||
| @Override | ||
| protected boolean removeEldestEntry(Map.Entry<String, Map<CharSequence, Integer>> eldest) { | ||
| return size() > MAX_CACHE_SIZE; | ||
| } | ||
| }; | ||
| private static final int MAX_CACHE_SIZE = 10000; | ||
|
|
||
| private final double threshold; | ||
| private final String matchMode; | ||
| private final String delims; | ||
|
|
||
| public TextSimilarityClustering(double threshold, String matchMode, String delims) { | ||
| this.threshold = validateThreshold(threshold); | ||
| this.matchMode = validateMatchMode(matchMode); | ||
| this.delims = delims != null ? delims : " "; | ||
| } | ||
|
|
||
| private static double validateThreshold(double threshold) { | ||
| if (threshold <= 0.0 || threshold >= 1.0) { | ||
| throw new IllegalArgumentException( | ||
| "The threshold must be > 0.0 and < 1.0, got: " + threshold); | ||
| } | ||
| return threshold; | ||
| } | ||
|
|
||
| private static String validateMatchMode(String matchMode) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ideally this should be enum instead of str, which makes it hard to misuse the interface |
||
| if (matchMode == null) { | ||
| return "termlist"; | ||
| } | ||
| switch (matchMode.toLowerCase()) { | ||
| case "termlist": | ||
| case "termset": | ||
| case "ngramset": | ||
| return matchMode.toLowerCase(); | ||
| default: | ||
| throw new IllegalArgumentException( | ||
| "Invalid match mode: " + matchMode + ". Must be one of: termlist, termset, ngramset"); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Compute similarity between two text values using the configured match mode. Used for | ||
| * incremental clustering against cluster representatives. | ||
| */ | ||
| public double computeSimilarity(String text1, String text2) { | ||
| // Normalize nulls to empty strings | ||
| String normalizedText1 = (text1 == null) ? "" : text1; | ||
| String normalizedText2 = (text2 == null) ? "" : text2; | ||
|
|
||
| // Both are empty - perfect match | ||
| if (normalizedText1.isEmpty() && normalizedText2.isEmpty()) { | ||
| return 1.0; | ||
| } | ||
|
|
||
| // One is empty, other isn't - no match | ||
| if (normalizedText1.isEmpty() || normalizedText2.isEmpty()) { | ||
| return 0.0; | ||
| } | ||
|
|
||
| // Both non-empty - compute cosine similarity | ||
| Map<CharSequence, Integer> vector1 = vectorizeWithCache(normalizedText1); | ||
| Map<CharSequence, Integer> vector2 = vectorizeWithCache(normalizedText2); | ||
|
|
||
| return COSINE.cosineSimilarity(vector1, vector2); | ||
| } | ||
|
|
||
| private Map<CharSequence, Integer> vectorizeWithCache(String value) { | ||
| return vectorCache.computeIfAbsent(value, this::vectorize); | ||
| } | ||
|
|
||
| private Map<CharSequence, Integer> vectorize(String value) { | ||
| if (value == null || value.isEmpty()) { | ||
| return Map.of(); | ||
| } | ||
| return switch (matchMode) { | ||
| case "termset" -> vectorizeTermSet(value); | ||
| case "ngramset" -> vectorizeNgramSet(value); | ||
| default -> vectorizeTermList(value); | ||
| }; | ||
| } | ||
|
|
||
| private static final java.util.regex.Pattern NUMERIC_PATTERN = | ||
| java.util.regex.Pattern.compile("^\\d+$"); | ||
|
|
||
| private static String normalizeToken(String token) { | ||
| return NUMERIC_PATTERN.matcher(token).matches() ? "*" : token; | ||
| } | ||
|
|
||
| /** Positional term frequency — token order matters. */ | ||
| private Map<CharSequence, Integer> vectorizeTermList(String value) { | ||
| String[] tokens = tokenize(value); | ||
| Map<CharSequence, Integer> vector = new HashMap<>((int) (tokens.length * 1.4)); | ||
|
|
||
| for (int i = 0; i < tokens.length; i++) { | ||
| if (!tokens[i].isEmpty()) { | ||
| String key = i + "-" + normalizeToken(tokens[i]); | ||
| vector.merge(key, 1, Integer::sum); | ||
| } | ||
| } | ||
| return vector; | ||
| } | ||
|
|
||
| /** Bag-of-words term frequency — token order ignored. */ | ||
| private Map<CharSequence, Integer> vectorizeTermSet(String value) { | ||
| String[] tokens = tokenize(value); | ||
| Map<CharSequence, Integer> vector = new HashMap<>((int) (tokens.length * 1.4)); | ||
|
|
||
| for (String token : tokens) { | ||
| if (!token.isEmpty()) { | ||
| vector.merge(normalizeToken(token), 1, Integer::sum); | ||
| } | ||
| } | ||
| return vector; | ||
| } | ||
|
|
||
| /** Character trigram frequency. */ | ||
| private Map<CharSequence, Integer> vectorizeNgramSet(String value) { | ||
| if (value.length() < 3) { | ||
| // For very short strings, fall back to character frequency | ||
| Map<CharSequence, Integer> vector = new HashMap<>(); | ||
| for (char c : value.toCharArray()) { | ||
| vector.merge(String.valueOf(c), 1, Integer::sum); | ||
| } | ||
| return vector; | ||
| } | ||
|
|
||
| Map<CharSequence, Integer> vector = new HashMap<>((int) ((value.length() - 2) * 1.4)); | ||
| for (int i = 0; i <= value.length() - 3; i++) { | ||
| String ngram = value.substring(i, i + 3); | ||
| vector.merge(ngram, 1, Integer::sum); | ||
| } | ||
| return vector; | ||
| } | ||
|
|
||
| private String[] tokenize(String value) { | ||
| if ("non-alphanumeric".equals(delims)) { | ||
| return value.split("[^a-zA-Z0-9_]+"); | ||
| } | ||
| String pattern = "[" + java.util.regex.Pattern.quote(delims) + "]+"; | ||
| return value.split(pattern); | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
core/src/main/java/org/opensearch/sql/ast/tree/Cluster.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| /* | ||
| * Copyright OpenSearch Contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| package org.opensearch.sql.ast.tree; | ||
|
|
||
| import com.google.common.collect.ImmutableList; | ||
| import java.util.List; | ||
| import lombok.AllArgsConstructor; | ||
| import lombok.EqualsAndHashCode; | ||
| import lombok.Getter; | ||
| import lombok.RequiredArgsConstructor; | ||
| import lombok.Setter; | ||
| import lombok.ToString; | ||
| import org.opensearch.sql.ast.AbstractNodeVisitor; | ||
| import org.opensearch.sql.ast.expression.UnresolvedExpression; | ||
|
|
||
| /** AST node for the PPL cluster command. */ | ||
| @Getter | ||
| @Setter | ||
| @ToString | ||
| @EqualsAndHashCode(callSuper = false) | ||
| @RequiredArgsConstructor | ||
| @AllArgsConstructor | ||
| public class Cluster extends UnresolvedPlan { | ||
|
|
||
| private final UnresolvedExpression sourceField; | ||
| private final double threshold; | ||
| private final String matchMode; | ||
| private final String labelField; | ||
| private final String countField; | ||
| private final boolean labelOnly; | ||
| private final boolean showCount; | ||
| private final String delims; | ||
| private UnresolvedPlan child; | ||
|
|
||
| @Override | ||
| public Cluster attach(UnresolvedPlan child) { | ||
| this.child = child; | ||
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public List<UnresolvedPlan> getChild() { | ||
| return this.child == null ? ImmutableList.of() : ImmutableList.of(this.child); | ||
| } | ||
|
|
||
| @Override | ||
| public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) { | ||
| return nodeVisitor.visitCluster(this, context); | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
initialCapacity equal to MAX_CACHE_SIZE?