1 /* 2 * Copyright 2020 The Netty Project 3 * 4 * The Netty Project licenses this file to you under the Apache License, version 2.0 (the 5 * "License"); you may not use this file except in compliance with the License. You may obtain a 6 * copy of the License at: 7 * 8 * https://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software distributed under the License 11 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 * or implied. See the License for the specific language governing permissions and limitations under 13 * the License. 14 */ 15 package io.netty.buffer.search; 16 17 /** 18 * Base class for precomputed factories that create {@link MultiSearchProcessor}s. 19 * <br> 20 * The purpose of {@link MultiSearchProcessor} is to perform efficient simultaneous search for multiple {@code needles} 21 * in the {@code haystack}, while scanning every byte of the input sequentially, only once. While it can also be used 22 * to search for just a single {@code needle}, using a {@link SearchProcessorFactory} would be more efficient for 23 * doing that. 24 * <br> 25 * See the documentation of {@link AbstractSearchProcessorFactory} for a comprehensive description of common usage. 26 * In addition to the functionality provided by {@link SearchProcessor}, {@link MultiSearchProcessor} adds 27 * a method to get the index of the {@code needle} found at the current position of the {@link MultiSearchProcessor} - 28 * {@link MultiSearchProcessor#getFoundNeedleId()}. 29 * <br> 30 * <b>Note:</b> in some cases one {@code needle} can be a suffix of another {@code needle}, eg. {@code {"BC", "ABC"}}, 31 * and there can potentially be multiple {@code needles} found ending at the same position of the {@code haystack}. 32 * In such case {@link MultiSearchProcessor#getFoundNeedleId()} returns the index of the longest matching {@code needle} 33 * in the array of {@code needles}. 34 * <br> 35 * Usage example (given that the {@code haystack} is a {@link io.netty.buffer.ByteBuf} containing "ABCD" and the 36 * {@code needles} are "AB", "BC" and "CD"): 37 * <pre> 38 * MultiSearchProcessorFactory factory = MultiSearchProcessorFactory.newAhoCorasicSearchProcessorFactory( 39 * "AB".getBytes(CharsetUtil.UTF_8), "BC".getBytes(CharsetUtil.UTF_8), "CD".getBytes(CharsetUtil.UTF_8)); 40 * MultiSearchProcessor processor = factory.newSearchProcessor(); 41 * 42 * int idx1 = haystack.forEachByte(processor); 43 * // idx1 is 1 (index of the last character of the occurrence of "AB" in the haystack) 44 * // processor.getFoundNeedleId() is 0 (index of "AB" in needles[]) 45 * 46 * int continueFrom1 = idx1 + 1; 47 * // continue the search starting from the next character 48 * 49 * int idx2 = haystack.forEachByte(continueFrom1, haystack.readableBytes() - continueFrom1, processor); 50 * // idx2 is 2 (index of the last character of the occurrence of "BC" in the haystack) 51 * // processor.getFoundNeedleId() is 1 (index of "BC" in needles[]) 52 * 53 * int continueFrom2 = idx2 + 1; 54 * 55 * int idx3 = haystack.forEachByte(continueFrom2, haystack.readableBytes() - continueFrom2, processor); 56 * // idx3 is 3 (index of the last character of the occurrence of "CD" in the haystack) 57 * // processor.getFoundNeedleId() is 2 (index of "CD" in needles[]) 58 * 59 * int continueFrom3 = idx3 + 1; 60 * 61 * int idx4 = haystack.forEachByte(continueFrom3, haystack.readableBytes() - continueFrom3, processor); 62 * // idx4 is -1 (no more occurrences of any of the needles) 63 * 64 * // This search session is complete, processor should be discarded. 65 * // To search for the same needles again, reuse the same {@link AbstractMultiSearchProcessorFactory} 66 * // to get a new MultiSearchProcessor. 67 * </pre> 68 */ 69 public abstract class AbstractMultiSearchProcessorFactory implements MultiSearchProcessorFactory { 70 71 /** 72 * Creates a {@link MultiSearchProcessorFactory} based on 73 * <a href="https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm">Aho–Corasick</a> 74 * string search algorithm. 75 * <br> 76 * Precomputation (this method) time is linear in the size of input ({@code O(Σ|needles|)}). 77 * <br> 78 * The factory allocates and retains an array of 256 * X ints plus another array of X ints, where X 79 * is the sum of lengths of each entry of {@code needles} minus the sum of lengths of repeated 80 * prefixes of the {@code needles}. 81 * <br> 82 * Search (the actual application of {@link MultiSearchProcessor}) time is linear in the size of 83 * {@link io.netty.buffer.ByteBuf} on which the search is performed ({@code O(|haystack|)}). 84 * Every byte of {@link io.netty.buffer.ByteBuf} is processed only once, sequentually, regardles of 85 * the number of {@code needles} being searched for. 86 * 87 * @param needles a varargs array of arrays of bytes to search for 88 * @return a new instance of {@link AhoCorasicSearchProcessorFactory} precomputed for the given {@code needles} 89 */ 90 public static AhoCorasicSearchProcessorFactory newAhoCorasicSearchProcessorFactory(byte[] ...needles) { 91 return new AhoCorasicSearchProcessorFactory(needles); 92 } 93 94 }