aspnetcore/src/Microsoft.AspNetCore.Rewrite/Internal/ModRewrite/Tokenizer.cs

// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Microsoft.AspNetCore.Rewrite.Internal.ModRewrite
{
    /// <summary>
    /// Splits a single mod_rewrite rule into whitespace-delimited tokens.
    /// </summary>
    /// <remarks>
    /// Spaces and tabs separate tokens unless they are preceded by a backslash or appear
    /// between double quotes. After splitting, the quotes that delimit a token are removed
    /// and the remaining text is passed through <see cref="Regex.Unescape(string)"/>.
    /// </remarks>
    public class Tokenizer
    {
        private const char Space = ' ';
        private const char Escape = '\\';
        private const char Tab = '\t';
        private const char Quote = '"';

        /// <summary>
        /// Splits <paramref name="rule"/> on spaces and tabs that are neither escaped nor quoted.
        /// </summary>
        /// <param name="rule">The rule to tokenize.</param>
        /// <returns>
        /// The tokens in order of appearance, with delimiting quotes removed and escape sequences
        /// resolved, or <c>null</c> when <paramref name="rule"/> is <c>null</c> or empty.
        /// </returns>
        /// <exception cref="FormatException">
        /// The rule ends directly after an escape character, or a quote is never closed.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// A token contains an escape sequence that <see cref="Regex.Unescape(string)"/> does not recognize.
        /// </exception>
        public IList<string> Tokenize(string rule)
        {
            // TODO make list of strings a reference to the original rule? (run into problems with escaped spaces).
            // TODO handle "s and probably replace \ character with no slash.
            if (string.IsNullOrEmpty(rule))
            {
                // Callers receive null (not an empty list) for a missing rule.
                return null;
            }
            var context = new ParserContext(rule);
            context.Next();

            var tokens = new List<string>();
            // The first token starts at the first character of the rule.
            context.Mark();
            while (true)
            {
                switch (context.Current)
                {
                    case Escape:
                        // Need to progress such that the next character is not evaluated.
                        if (!context.Next())
                        {
                            // Means that a character was not escaped appropriately Ex: "foo\"
                            throw new FormatException($"Invalid escaper character in string: {rule}");
                        }
                        break;
                    case Quote:
                        // Ignore all characters until the next quote is hit.
                        // Backslashes are NOT treated as escapes while inside the quotes.
                        if (!context.Next())
                        {
                            throw new FormatException($"Mismatched number of quotes: {rule}");
                        }

                        while (context.Current != Quote)
                        {
                            if (!context.Next())
                            {
                                throw new FormatException($"Mismatched number of quotes: {rule}");
                            }
                        }
                        break;
                    case Space:
                    case Tab:
                        // Unescaped, unquoted whitespace ends the current token: time to capture!
                        // NOTE(review): if the rule starts with whitespace the capture here is
                        // presumably empty and no new mark is set in this branch; whether the
                        // following text is still captured depends on ParserContext - confirm.
                        var token = context.Capture();
                        if (!string.IsNullOrEmpty(token))
                        {
                            tokens.Add(token);
                            // Skip the whole run of separating whitespace.
                            do
                            {
                                if (!context.Next())
                                {
                                    // At end of string, we can return at this point.
                                    RemoveQuotesAndEscapeCharacters(tokens);
                                    return tokens;
                                }
                            } while (context.Current == Space || context.Current == Tab);
                            // Mark the first character of the next token, then step back so the
                            // Next() at the bottom of the loop lands on that character again and
                            // it is evaluated by the switch like any other.
                            context.Mark();
                            context.Back();
                        }
                        break;
                }
                if (!context.Next())
                {
                    // End of string. Capture.
                    break;
                }
            }
            var done = context.Capture();
            if (!string.IsNullOrEmpty(done))
            {
                tokens.Add(done);
            }

            RemoveQuotesAndEscapeCharacters(tokens);
            return tokens;
        }

        /// <summary>
        /// Replaces every token in place with its unquoted, unescaped form.
        /// </summary>
        /// <remarks>
        /// Leading and trailing delimiting quotes are removed first, then the result is run through
        /// <see cref="Regex.Unescape(string)"/>.
        /// This is on start-up, so more forgiving towards substrings / new strings.
        /// If this is a perf/memory problem, discuss later.
        /// NOTE(review): <see cref="Regex.Unescape(string)"/> also rewrites regex escapes such as
        /// <c>\.</c> and is documented to throw on unrecognized sequences - confirm that is intended
        /// for rule patterns.
        /// </remarks>
        /// <param name="tokens">The raw tokens captured by <see cref="Tokenize(string)"/>.</param>
        private static void RemoveQuotesAndEscapeCharacters(IList<string> tokens)
        {
            for (var i = 0; i < tokens.Count; i++)
            {
                tokens[i] = Regex.Unescape(StripDelimitingQuotes(tokens[i]));
            }
        }

        /// <summary>
        /// Removes the run of quotes at the start and at the end of <paramref name="token"/>,
        /// but never removes a trailing quote that was escaped with a backslash.
        /// </summary>
        /// <remarks>
        /// A plain <c>Trim('"')</c> would also strip an escaped trailing quote (for example in
        /// <c>foo\"</c>), leaving a dangling backslash and losing the literal quote. To decide which
        /// quotes are literal, the token is re-walked with the same rules <see cref="Tokenize(string)"/>
        /// applies: outside quotes a backslash escapes the next character; inside quotes everything
        /// up to the closing quote is skipped.
        /// </remarks>
        /// <param name="token">A raw token as captured from the rule.</param>
        /// <returns>The token without its delimiting quotes; escape characters are left untouched.</returns>
        private static string StripDelimitingQuotes(string token)
        {
            // Index just past the last backslash-escaped character; trailing trimming stops here.
            var literalEnd = 0;
            for (var i = 0; i < token.Length; i++)
            {
                var current = token[i];
                if (current == Escape)
                {
                    // The character after the backslash is literal, whatever it is.
                    i++;
                    literalEnd = i + 1;
                }
                else if (current == Quote)
                {
                    // Jump to the closing quote; nothing in between is treated as an escape.
                    var closing = token.IndexOf(Quote, i + 1);
                    if (closing < 0)
                    {
                        // Unterminated quote: Tokenize rejects these, so just stop scanning.
                        break;
                    }
                    i = closing;
                }
            }

            // A leading run of quotes can never contain an escaped quote, so drop all of it.
            var start = 0;
            while (start < token.Length && token[start] == Quote)
            {
                start++;
            }

            // Drop trailing quotes, but stop before eating into an escaped character.
            var end = token.Length;
            while (end > start && end > literalEnd && token[end - 1] == Quote)
            {
                end--;
            }

            return token.Substring(start, end - start);
        }
    }
}