StringUtil.java

// Copyright (c) ZeroC, Inc.

package com.zeroc.Ice;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * @hidden Public because it's used by IceGridGUI.
 */
public final class StringUtil {
    /**
     * Finds the first character of {@code str} that is contained in {@code match}, scanning from
     * the beginning of the string.
     *
     * @param str the string to scan
     * @param match the set of characters to look for
     * @return the index of the first character of {@code str} found in {@code match}, or -1 when
     *     there is none
     */
    public static int findFirstOf(String str, String match) {
        return findFirstOf(str, match, 0);
    }

    /**
     * Finds the first character of {@code str}, at or after {@code start}, that is contained in
     * {@code match}.
     *
     * @param str the string to scan
     * @param match the set of characters to look for
     * @param start the index at which scanning begins
     * @return the index of the first character of {@code str} found in {@code match}, or -1 when
     *     there is none
     */
    public static int findFirstOf(String str, String match, int start) {
        final int end = str.length();
        int index = start;
        while (index < end) {
            if (match.indexOf(str.charAt(index)) >= 0) {
                return index;
            }
            index++;
        }
        return -1;
    }

    /**
     * Finds the first character of {@code str}, at or after {@code start}, that is NOT contained
     * in {@code match}.
     *
     * @param str the string to scan
     * @param match the set of characters to skip over
     * @param start the index at which scanning begins
     * @return the index of the first character of {@code str} absent from {@code match}, or -1
     *     when every scanned character appears in {@code match}
     */
    public static int findFirstNotOf(String str, String match, int start) {
        final int end = str.length();
        for (int index = start; index < end; index++) {
            if (match.indexOf(str.charAt(index)) != -1) {
                continue; // still inside the excluded set
            }
            return index;
        }
        return -1;
    }

    /**
     * Appends the escaped form of a single character to {@code sb}.
     *
     * <p>Backslash, single quote, double quote and the usual control characters get their
     * two-character escapes; in {@code Compat} mode BEL and VT are written as 3-digit octal
     * escapes instead. Characters listed in {@code special} are prefixed with a backslash.
     * Remaining characters outside 32-126 are written as a 3-digit octal escape ({@code Compat}),
     * as a 4-hex-digit universal character name (control characters, DEL, or any such character in
     * {@code ASCII} mode), or unchanged.
     *
     * @param c the character to encode (a UTF-8 byte value when the mode is {@code Compat})
     * @param sb the builder receiving the output
     * @param special additional characters to escape with a backslash, or null
     * @param toStringMode the escaping mode
     */
    private static void encodeChar(char c, StringBuilder sb, String special, ToStringMode toStringMode) {
        final boolean compat = toStringMode == ToStringMode.Compat;

        switch (c) {
            case '\\' -> sb.append("\\\\");
            case '\'' -> sb.append("\\'");
            case '"' -> sb.append("\\\"");
            // Octal escape in Compat mode for compatibility with 3.6 and earlier
            case '\007' -> sb.append(compat ? "\\007" : "\\a");
            case '\b' -> sb.append("\\b");
            case '\f' -> sb.append("\\f");
            case '\n' -> sb.append("\\n");
            case '\r' -> sb.append("\\r");
            case '\t' -> sb.append("\\t");
            // Octal escape in Compat mode for compatibility with 3.6 and earlier
            case '\013' -> sb.append(compat ? "\\013" : "\\v");
            default -> {
                if (special != null && special.indexOf(c) != -1) {
                    sb.append('\\').append(c);
                } else if (c >= 32 && c <= 126) {
                    // printable ASCII character
                    sb.append(c);
                } else if (compat) {
                    // When ToStringMode=Compat, c is a UTF-8 byte
                    assert (c < 256);

                    // Always emit three octal digits: without the leading zeroes, an escape such
                    // as \001 followed by the character '3' would be read back by the decoder as
                    // the single escape \13 (value 11).
                    String digits = Integer.toOctalString(c);
                    sb.append('\\');
                    for (int pad = 3 - digits.length(); pad > 0; pad--) {
                        sb.append('0');
                    }
                    sb.append(digits);
                } else if (c < 32 || c == 127 || toStringMode == ToStringMode.ASCII) {
                    // universal character name with exactly four hex digits
                    String digits = Integer.toHexString(c);
                    sb.append("\\u");
                    for (int pad = 4 - digits.length(); pad > 0; pad--) {
                        sb.append('0');
                    }
                    sb.append(digits);
                } else {
                    // non-ASCII character kept as is
                    sb.append(c);
                }
            }
        }
    }

    /**
     * Adds escape sequences (like "\n") to the input string.
     *
     * <p>In {@code Compat} mode the string is first converted to UTF-8 and every byte is escaped
     * individually. In the other modes the string is processed one UTF-16 code unit at a time; in
     * {@code ASCII} mode each surrogate pair is written as a single {@code \U} escape with eight
     * hex digits.
     *
     * @param s the string to escape
     * @param special additional characters to escape, or null; each must be in the ASCII range
     *     32-126
     * @param toStringMode the string mode for escaping
     * @return the escaped string
     * @throws IllegalArgumentException if {@code special} contains a character outside 32-126, or
     *     if, in {@code ASCII} mode, {@code s} contains a surrogate that is not part of a
     *     well-formed high/low pair
     */
    public static String escapeString(String s, String special, ToStringMode toStringMode) {
        if (special != null) {
            for (int i = 0; i < special.length(); i++) {
                char sc = special.charAt(i);
                if (sc < 32 || sc > 126) {
                    throw new IllegalArgumentException(
                        "special characters must be in ASCII range 32-126");
                }
            }
        }

        if (toStringMode == ToStringMode.Compat) {
            // Encode UTF-8 bytes. Using the Charset constant avoids the checked
            // UnsupportedEncodingException of the charset-name overload.
            byte[] bytes = s.getBytes(StandardCharsets.UTF_8);

            StringBuilder result = new StringBuilder(bytes.length);
            for (byte b : bytes) {
                encodeChar((char) (b & 0xFF), result, special, toStringMode);
            }
            return result.toString();
        }

        StringBuilder result = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (toStringMode == ToStringMode.Unicode || !Character.isSurrogate(c)) {
                encodeChar(c, result, special, toStringMode);
                continue;
            }

            assert (toStringMode == ToStringMode.ASCII && Character.isSurrogate(c));

            // A surrogate is only meaningful as a (high, low) pair. Reject anything else instead
            // of combining unrelated code units into a bogus code point.
            if (Character.isLowSurrogate(c)) {
                throw new IllegalArgumentException("Low surrogate without high surrogate");
            }
            if (i + 1 == s.length() || !Character.isLowSurrogate(s.charAt(i + 1))) {
                throw new IllegalArgumentException("High surrogate without low surrogate");
            }

            i++; // consume the low surrogate as well
            int codePoint = Character.toCodePoint(c, s.charAt(i));

            // append \Unnnnnnnn
            result.append("\\U");
            String hex = Integer.toHexString(codePoint);
            for (int j = hex.length(); j < 8; j++) {
                result.append('0');
            }
            result.append(hex);
        }

        return result.toString();
    }

    /**
     * Returns the character at {@code pos}, rejecting control characters.
     *
     * @param s the string being examined
     * @param pos the index of the character to check
     * @return the character at {@code pos}
     * @throws IllegalArgumentException if the character is below 32 or equal to 127
     */
    private static char checkChar(String s, int pos) {
        final char c = s.charAt(pos);
        final boolean acceptable = c >= 32 && c != 127;
        if (acceptable) {
            return c;
        }

        // Describe the offending position by quoting everything that precedes it.
        final String where = pos > 0 ? "character after `" + s.substring(0, pos) + "'" : "first character";
        throw new IllegalArgumentException(where + " has invalid ordinal value " + (int) c);
    }

    /**
     * Decodes the character or escape sequence starting at {@code start} and appends it to
     * {@code result}.
     *
     * <p>Handled forms:
     *
     * <ul>
     *   <li>a plain character, which must not be a control character;
     *   <li>a trailing backslash (last character before {@code end}), kept as a backslash;
     *   <li>the simple escapes for backslash, quotes, {@code ?}, {@code a b f n r t v};
     *   <li>universal character names: a backslash followed by a lowercase u and exactly 4 hex
     *       digits, or by {@code U} and exactly 8 hex digits;
     *   <li>a run of consecutive octal (1-3 digits) or {@code \x} (1-2 hex digits) escapes, whose
     *       byte values are decoded together as one UTF-8 sequence;
     *   <li>any other escaped character: the backslash is dropped only when the character is
     *       listed in {@code special}, otherwise both are kept.
     * </ul>
     *
     * @param s the string being decoded
     * @param start the index of the first character to decode; {@code 0 <= start < end}
     * @param end the exclusive end of the region being decoded; {@code end <= s.length()}
     * @param special characters for which an escaping backslash is removed, or null
     * @param result the builder receiving the decoded text
     * @return the index of the first character following the decoded character or escape sequence
     * @throws IllegalArgumentException for a control character, a universal character name with
     *     too few hex digits or designating a surrogate, a {@code \x} escape without hex digits,
     *     or an octal escape whose value exceeds 255
     */
    private static int decodeChar(String s, int start, int end, String special, StringBuilder result) {
        assert (start >= 0);
        assert (start < end);
        assert (end <= s.length());

        if (s.charAt(start) != '\\') {
            result.append(checkChar(s, start++));
        } else if (start + 1 == end) {
            // Keep a trailing backslash
            ++start;
            result.append('\\');
        } else {
            char c = s.charAt(++start);

            switch (c) {
                case '\\', '\'', '"', '?' -> {
                    ++start;
                    result.append(c);
                }
                case 'a' -> {
                    ++start;
                    result.append('\u0007');
                }
                case 'b' -> {
                    ++start;
                    result.append('\b');
                }
                case 'f' -> {
                    ++start;
                    result.append('\f');
                }
                case 'n' -> {
                    ++start;
                    result.append('\n');
                }
                case 'r' -> {
                    ++start;
                    result.append('\r');
                }
                case 't' -> {
                    ++start;
                    result.append('\t');
                }
                case 'v' -> {
                    ++start;
                    result.append('\u000b');
                }

                // Universal character name: exactly 4 (lowercase) or 8 (uppercase) hex digits
                case 'u', 'U' -> {
                    int codePoint = 0;
                    boolean inBMP = c == 'u';
                    int size = inBMP ? 4 : 8;
                    ++start;
                    while (size > 0 && start < end) {
                        c = s.charAt(start++);
                        int charVal = 0;
                        if (c >= '0' && c <= '9') {
                            charVal = c - '0';
                        } else if (c >= 'a' && c <= 'f') {
                            charVal = 10 + (c - 'a');
                        } else if (c >= 'A' && c <= 'F') {
                            charVal = 10 + (c - 'A');
                        } else {
                            break; // while
                        }
                        codePoint = codePoint * 16 + charVal;
                        --size;
                    }
                    if (size > 0) {
                        throw new IllegalArgumentException("Invalid universal character name: too few hex digits");
                    }
                    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
                        throw new IllegalArgumentException("A universal character name cannot designate a surrogate");
                    }
                    if (inBMP || Character.isBmpCodePoint(codePoint)) {
                        result.append((char) codePoint);
                    } else {
                        result.append(Character.toChars(codePoint));
                    }
                }

                // UTF-8 byte sequence encoded with octal or hex escapes
                case '0', '1', '2', '3', '4', '5', '6', '7', 'x' -> {
                    // Every decoded byte consumes at least one input character, so the remaining
                    // input length is a safe upper bound for the number of bytes.
                    byte[] arr = new byte[end - start];
                    int i = 0;
                    boolean more = true;
                    while (more) {
                        int val = 0;
                        if (c == 'x') {
                            int size = 2;
                            ++start;
                            while (size > 0 && start < end) {
                                c = s.charAt(start++);
                                int charVal = 0;
                                if (c >= '0' && c <= '9') {
                                    charVal = c - '0';
                                } else if (c >= 'a' && c <= 'f') {
                                    charVal = 10 + (c - 'a');
                                } else if (c >= 'A' && c <= 'F') {
                                    charVal = 10 + (c - 'A');
                                } else {
                                    --start; // move back
                                    break; // while
                                }
                                val = val * 16 + charVal;
                                --size;
                            }
                            if (size == 2) {
                                throw new IllegalArgumentException("Invalid \\x escape sequence: no hex digit");
                            }
                        } else {
                            for (int j = 0; j < 3 && start < end; j++) {
                                int charVal = s.charAt(start++) - '0';
                                if (charVal < 0 || charVal > 7) {
                                    --start; // move back
                                    assert (j != 0); // must be at least one digit
                                    break; // for
                                }
                                val = val * 8 + charVal;
                            }
                            if (val > 255) {
                                String msg =
                                    "octal value \\" + Integer.toOctalString(val) + " (" + val + ") is out of range";
                                throw new IllegalArgumentException(msg);
                            }
                        }

                        arr[i++] = (byte) val;

                        more = false;

                        // Continue the byte sequence only when the next escape is \x or starts
                        // with an octal digit. '8' and '9' are not octal digits: such an escape is
                        // left untouched here and handled by the next decodeChar call.
                        if ((start + 1 < end) && s.charAt(start) == '\\') {
                            c = s.charAt(start + 1);
                            if (c == 'x' || (c >= '0' && c <= '7')) {
                                start++;
                                more = true;
                            }
                        }
                    }

                    result.append(new String(arr, 0, i, StandardCharsets.UTF_8));
                }

                default -> {
                    if (special == null || special.isEmpty() || special.indexOf(c) == -1) {
                        result.append('\\'); // not in special, so we keep the backslash
                    }
                    result.append(checkChar(s, start++));
                }
            }
        }

        return start;
    }

    /**
     * Removes the escape sequences added by escapeString from the region {@code [start, end)} of
     * {@code s}.
     *
     * @param s the string to unescape
     * @param start the index of the first character of the region
     * @param end the exclusive end index of the region
     * @param special characters whose escaping backslash should be removed, or null; each must be
     *     in the ASCII range 32-126
     * @return the unescaped region
     * @throws IllegalArgumentException if {@code special} is invalid or the region is not a valid
     *     escaped string
     */
    public static String unescapeString(String s, int start, int end, String special) {
        assert (start >= 0 && start <= end && end <= s.length());

        if (special != null) {
            for (char candidate : special.toCharArray()) {
                if (candidate < 32 || candidate > 126) {
                    throw new IllegalArgumentException("special characters must be in ASCII range 32-126");
                }
            }
        }

        final int firstBackslash = s.indexOf('\\', start);
        final boolean hasEscape = firstBackslash != -1 && firstBackslash < end;

        if (hasEscape) {
            StringBuilder decoded = new StringBuilder(end - start);
            int cursor = start;
            while (cursor < end) {
                cursor = decodeChar(s, cursor, end, special, decoded);
            }
            return decoded.toString();
        }

        // Fast path for a region without backslash: only validate the characters, then return the
        // region unchanged.
        for (int index = start; index < end; index++) {
            checkChar(s, index);
        }
        return s.substring(start, end);
    }

    /**
     * Splits a string into tokens separated by any of the given delimiter characters.
     *
     * <p>Single or double quotes group text (including delimiters) into one token and are removed
     * from the output. A backslash placed before a quote (before the active quote character when
     * inside a quoted section) is dropped and the quote is kept as a literal character. Empty
     * tokens are not returned.
     *
     * @param str the string to split
     * @param delim the delimiter characters
     * @return the tokens, or null if a quoted section is not closed
     */
    public static String[] splitString(String str, String delim) {
        final int len = str.length();
        final List<String> tokens = new ArrayList<>();
        final StringBuilder current = new StringBuilder(len);

        char openQuote = '\0'; // '\0' means "not inside a quoted section"
        int pos = 0;
        while (pos < len) {
            final char ch = str.charAt(pos);
            final boolean quoted = openQuote != '\0';

            // Opening quote: remember it and drop it from the output.
            if (!quoted && (ch == '"' || ch == '\'')) {
                openQuote = ch;
                pos++;
                continue;
            }

            // Closing quote: leave the quoted section and drop the quote.
            if (quoted && ch == openQuote) {
                openQuote = '\0';
                pos++;
                continue;
            }

            // Escaped quote: skip the backslash and keep the quote itself.
            if (ch == '\\' && pos + 1 < len) {
                final char next = str.charAt(pos + 1);
                final boolean escapesQuote = quoted ? next == openQuote : (next == '"' || next == '\'');
                if (escapesQuote) {
                    current.append(next);
                    pos += 2;
                    continue;
                }
            }

            // Delimiter outside quotes: finish the current token, if any.
            if (delim.indexOf(ch) != -1 && !quoted) {
                pos++;
                if (current.length() > 0) {
                    tokens.add(current.toString());
                    current.setLength(0);
                }
                continue;
            }

            current.append(ch);
            pos++;
        }

        if (current.length() > 0) {
            tokens.add(current.toString());
        }
        if (openQuote != '\0') {
            return null; // Unmatched quote.
        }
        return tokens.toArray(new String[0]);
    }

    /**
     * Looks for a single or double quote at {@code start} and, if there is one, locates the
     * matching closing quote. A quote immediately preceded by a backslash is not treated as the
     * closing quote.
     *
     * @param s the string to check
     * @param start the position at which a quote is expected
     * @return the index of the closing quote; 0 if the character at {@code start} is not a quote;
     *     -1 if the quote is never closed
     */
    public static int checkQuote(String s, int start) {
        final char quote = s.charAt(start);
        if (quote != '"' && quote != '\'') {
            return 0; // Not quoted
        }

        final int len = s.length();
        int from = start + 1;
        while (from < len) {
            final int candidate = s.indexOf(quote, from);
            if (candidate == -1) {
                break;
            }
            if (s.charAt(candidate - 1) != '\\') {
                return candidate;
            }
            from = candidate + 1; // escaped quote, keep searching after it
        }
        return -1; // Unmatched quote
    }

    /**
     * Matches a string against a pattern that may contain a wildcard.
     *
     * <p>Only the first {@code *} in {@code pat} acts as a wildcard: the text before it must be a
     * prefix of {@code s} and the text after it must be a suffix of {@code s}. A pattern without
     * {@code *} must equal {@code s} exactly. A pattern ending with the wildcard matches as soon
     * as the prefix matches, regardless of {@code emptyMatch}.
     *
     * @param s the string to match; must not be empty
     * @param pat the pattern, optionally containing a wildcard ({@code *}); must not be empty
     * @param emptyMatch whether a wildcard followed by more pattern text may match the empty
     *     string
     * @return true if the string matches the pattern
     */
    public static boolean match(String s, String pat, boolean emptyMatch) {
        assert (!s.isEmpty());
        assert (!pat.isEmpty());

        // If pattern does not contain a wildcard just compare strings.
        int beginIndex = pat.indexOf('*');
        if (beginIndex < 0) {
            return s.equals(pat);
        }

        // Make sure start of the strings match. regionMatches returns false (rather than
        // throwing) when s is shorter than the prefix.
        if (!s.regionMatches(0, pat, 0, beginIndex)) {
            return false;
        }

        // Make sure there is something present in the middle to match the wildcard.
        // If emptyMatch is true, allow a match of "".
        int endLength = pat.length() - beginIndex - 1;
        if (endLength == 0) {
            return true;
        }
        if (endLength > s.length()) {
            return false;
        }
        int endIndex = s.length() - endLength;
        if (endIndex < beginIndex || (!emptyMatch && endIndex == beginIndex)) {
            return false;
        }

        // Make sure end of the strings match: the last endLength characters of s against
        // everything after the wildcard in pat.
        return s.regionMatches(endIndex, pat, beginIndex + 1, endLength);
    }

    // Private constructor: this class only exposes static helpers and is not meant to be instantiated.
    private StringUtil() {}
}