From 15649b7e31a1896562a72806119fc439527f94eb Mon Sep 17 00:00:00 2001 From: David Obando Date: Fri, 4 Mar 2016 10:41:27 -0800 Subject: [PATCH] Faster SubMatch implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Submatch has been sped up by implementing a modified Boyer–Moore–Horspool algorithm with an average-case complexity of O(N) on random text. Worst case, it behaves similarly to the previous implementation O(MN), where M is the length of the boundary and N is the length of the buffer to operate on. Method SubMatch looks for two things: 1. Whether the byte array segment fully contains the boundary, or 2. Whether the byte array ends with the start of the boundary. Case 1 is now a lot faster than the previous implementation. Case 2 remains using the same code as before. The method will do Case 1 until the matchOffset is equal to N-M. It then switches to Case 2, unless a match is found. The code can be further sped up with a full Boyer–Moore implementation, or something more sophisticated. This however can be evaluated in the case that this implementation is insufficiently performant for our main scenarios. This commit resolves issue #575. --- .../MultipartBoundary.cs | 72 +++++++++++++++++ .../MultipartReader.cs | 7 +- .../MultipartReaderStream.cs | 79 +++++++++++++------ .../MultipartReaderTests.cs | 60 ++++++++++++++ 4 files changed, 189 insertions(+), 29 deletions(-) create mode 100644 src/Microsoft.AspNetCore.WebUtilities/MultipartBoundary.cs diff --git a/src/Microsoft.AspNetCore.WebUtilities/MultipartBoundary.cs b/src/Microsoft.AspNetCore.WebUtilities/MultipartBoundary.cs new file mode 100644 index 0000000000..0da1303835 --- /dev/null +++ b/src/Microsoft.AspNetCore.WebUtilities/MultipartBoundary.cs @@ -0,0 +1,72 @@ +// Copyright (c) .NET Foundation. All rights reserved. +// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. + +using System; +using System.Text; + +namespace Microsoft.AspNetCore.WebUtilities +{ + internal class MultipartBoundary + { + private readonly int[] _skipTable = new int[256]; + private readonly string _boundary; + private bool _expectLeadingCrlf; + + public MultipartBoundary(string boundary, bool expectLeadingCrlf = true) + { + if (boundary == null) + { + throw new ArgumentNullException(nameof(boundary)); + } + + _boundary = boundary; + _expectLeadingCrlf = expectLeadingCrlf; + Initialize(_boundary, _expectLeadingCrlf); + } + + private void Initialize(string boundary, bool expectLeadingCrlf) + { + if (expectLeadingCrlf) + { + BoundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary); + } + else + { + BoundaryBytes = Encoding.UTF8.GetBytes("--" + boundary); + } + FinalBoundaryLength = BoundaryBytes.Length + 2; // Include the final '--' terminator. + + var length = BoundaryBytes.Length; + for (var i = 0; i < _skipTable.Length; ++i) + { + _skipTable[i] = length; + } + for (var i = 0; i < length; ++i) + { + _skipTable[BoundaryBytes[i]] = Math.Max(1, length - 1 - i); + } + } + + public int GetSkipValue(byte input) + { + return _skipTable[input]; + } + + public bool ExpectLeadingCrlf + { + get { return _expectLeadingCrlf; } + set + { + if (value != _expectLeadingCrlf) + { + _expectLeadingCrlf = value; + Initialize(_boundary, _expectLeadingCrlf); + } + } + } + + public byte[] BoundaryBytes { get; private set; } + + public int FinalBoundaryLength { get; private set; } + } +} diff --git a/src/Microsoft.AspNetCore.WebUtilities/MultipartReader.cs b/src/Microsoft.AspNetCore.WebUtilities/MultipartReader.cs index 32dbdbcb9d..96b92dd501 100644 --- a/src/Microsoft.AspNetCore.WebUtilities/MultipartReader.cs +++ b/src/Microsoft.AspNetCore.WebUtilities/MultipartReader.cs @@ -17,7 +17,7 @@ namespace Microsoft.AspNetCore.WebUtilities private const int DefaultBufferSize = 1024 * 4; private readonly BufferedReadStream _stream; - private readonly string _boundary; + private readonly MultipartBoundary _boundary; private MultipartReaderStream _currentStream; public MultipartReader(string boundary, Stream stream) @@ -42,9 +42,9 @@ namespace Microsoft.AspNetCore.WebUtilities throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Insufficient buffer space, the buffer must be larger than the boundary: " + boundary); } _stream = new BufferedReadStream(stream, bufferSize); - _boundary = boundary; + _boundary = new MultipartBoundary(boundary, false); // This stream will drain any preamble data and remove the first boundary marker. - _currentStream = new MultipartReaderStream(_stream, _boundary, expectLeadingCrlf: false); + _currentStream = new MultipartReaderStream(_stream, _boundary); } /// @@ -69,6 +69,7 @@ namespace Microsoft.AspNetCore.WebUtilities return null; } var headers = await ReadHeadersAsync(cancellationToken); + _boundary.ExpectLeadingCrlf = true; _currentStream = new MultipartReaderStream(_stream, _boundary); long? baseStreamOffset = _stream.CanSeek ? (long?)_stream.Position : null; return new MultipartSection() { Headers = headers, Body = _currentStream, BaseStreamOffset = baseStreamOffset }; diff --git a/src/Microsoft.AspNetCore.WebUtilities/MultipartReaderStream.cs b/src/Microsoft.AspNetCore.WebUtilities/MultipartReaderStream.cs index 442c693eab..fdd84a5c6d 100644 --- a/src/Microsoft.AspNetCore.WebUtilities/MultipartReaderStream.cs +++ b/src/Microsoft.AspNetCore.WebUtilities/MultipartReaderStream.cs @@ -4,7 +4,6 @@ using System; using System.Diagnostics; using System.IO; -using System.Text; using System.Threading; using System.Threading.Tasks; @@ -12,9 +11,8 @@ namespace Microsoft.AspNetCore.WebUtilities { internal class MultipartReaderStream : Stream { + private readonly MultipartBoundary _boundary; private readonly BufferedReadStream _innerStream; - private readonly byte[] _boundaryBytes; - private readonly int _finalBoundaryLength; private readonly long _innerOffset; private long _position; private long _observedLength; @@ -25,8 +23,7 @@ namespace Microsoft.AspNetCore.WebUtilities /// /// The . /// The boundary pattern to use. - /// Specifies whether a leading crlf should be expected. - public MultipartReaderStream(BufferedReadStream stream, string boundary, bool expectLeadingCrlf = true) + public MultipartReaderStream(BufferedReadStream stream, MultipartBoundary boundary) { if (stream == null) { @@ -40,15 +37,7 @@ namespace Microsoft.AspNetCore.WebUtilities _innerStream = stream; _innerOffset = _innerStream.CanSeek ? _innerStream.Position : 0; - if (expectLeadingCrlf) - { - _boundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary); - } - else - { - _boundaryBytes = Encoding.UTF8.GetBytes("--" + boundary); - } - _finalBoundaryLength = _boundaryBytes.Length + 2; // Include the final '--' terminator. + _boundary = boundary; } public bool FinalBoundaryFound { get; private set; } @@ -205,7 +194,7 @@ namespace Microsoft.AspNetCore.WebUtilities } PositionInnerStream(); - if (!_innerStream.EnsureBuffered(_finalBoundaryLength)) + if (!_innerStream.EnsureBuffered(_boundary.FinalBoundaryLength)) { throw new IOException("Unexpected end of stream."); } @@ -215,7 +204,7 @@ namespace Microsoft.AspNetCore.WebUtilities int matchOffset; int matchCount; int read; - if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount)) + if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount)) { // We found a possible match, return any data before it. if (matchOffset > bufferedData.Offset) @@ -223,12 +212,12 @@ namespace Microsoft.AspNetCore.WebUtilities read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset)); return UpdatePosition(read); } - Debug.Assert(matchCount == _boundaryBytes.Length); + Debug.Assert(matchCount == _boundary.BoundaryBytes.Length); // "The boundary may be followed by zero or more characters of // linear whitespace. It is then terminated by either another CRLF" // or -- for the final boundary. - byte[] boundary = new byte[_boundaryBytes.Length]; + byte[] boundary = new byte[_boundary.BoundaryBytes.Length]; read = _innerStream.Read(boundary, 0, boundary.Length); Debug.Assert(read == boundary.Length); // It should have all been buffered var remainder = _innerStream.ReadLine(lengthLimit: 100); // Whitespace may exceed the buffer. @@ -256,7 +245,7 @@ namespace Microsoft.AspNetCore.WebUtilities } PositionInnerStream(); - if (!await _innerStream.EnsureBufferedAsync(_finalBoundaryLength, cancellationToken)) + if (!await _innerStream.EnsureBufferedAsync(_boundary.FinalBoundaryLength, cancellationToken)) { throw new IOException("Unexpected end of stream."); } @@ -266,7 +255,7 @@ namespace Microsoft.AspNetCore.WebUtilities int matchOffset; int matchCount; int read; - if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount)) + if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount)) { // We found a possible match, return any data before it. if (matchOffset > bufferedData.Offset) @@ -275,12 +264,12 @@ namespace Microsoft.AspNetCore.WebUtilities read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset)); return UpdatePosition(read); } - Debug.Assert(matchCount == _boundaryBytes.Length); + Debug.Assert(matchCount == _boundary.BoundaryBytes.Length); // "The boundary may be followed by zero or more characters of // linear whitespace. It is then terminated by either another CRLF" // or -- for the final boundary. - byte[] boundary = new byte[_boundaryBytes.Length]; + byte[] boundary = new byte[_boundary.BoundaryBytes.Length]; read = _innerStream.Read(boundary, 0, boundary.Length); Debug.Assert(read == boundary.Length); // It should have all been buffered var remainder = await _innerStream.ReadLineAsync(lengthLimit: 100, cancellationToken: cancellationToken); // Whitespace may exceed the buffer. @@ -300,18 +289,44 @@ namespace Microsoft.AspNetCore.WebUtilities return UpdatePosition(read); } - // Does Segment1 contain all of segment2, or does it end with the start of segment2? + // Does segment1 contain all of matchBytes, or does it end with the start of matchBytes? // 1: AAAAABBBBBCCCCC // 2: BBBBB // Or: // 1: AAAAABBB // 2: BBBBB - private static bool SubMatch(ArraySegment segment1, byte[] matchBytes, out int matchOffset, out int matchCount) + private bool SubMatch(ArraySegment segment1, byte[] matchBytes, out int matchOffset, out int matchCount) { + // clear matchCount to zero matchCount = 0; - for (matchOffset = segment1.Offset; matchOffset < segment1.Offset + segment1.Count; matchOffset++) + + // case 1: does segment1 fully contain matchBytes? { - int countLimit = segment1.Offset - matchOffset + segment1.Count; + var matchBytesLengthMinusOne = matchBytes.Length - 1; + var matchBytesLastByte = matchBytes[matchBytesLengthMinusOne]; + var segmentEndMinusMatchBytesLength = segment1.Offset + segment1.Count - matchBytes.Length; + + matchOffset = segment1.Offset; + while (matchOffset < segmentEndMinusMatchBytesLength) + { + var lookaheadTailChar = segment1.Array[matchOffset + matchBytesLengthMinusOne]; + if (lookaheadTailChar == matchBytesLastByte && + CompareBuffers(segment1.Array, matchOffset, matchBytes, 0, matchBytesLengthMinusOne) == 0) + { + matchCount = matchBytes.Length; + return true; + } + matchOffset += _boundary.GetSkipValue(lookaheadTailChar); + } + } + + // case 2: does segment1 end with the start of matchBytes? + var segmentEnd = segment1.Offset + segment1.Count; + + matchCount = 0; + for (; matchOffset < segmentEnd; matchOffset++) + { + var countLimit = segmentEnd - matchOffset; for (matchCount = 0; matchCount < matchBytes.Length && matchCount < countLimit; matchCount++) { if (matchBytes[matchCount] != segment1.Array[matchOffset + matchCount]) @@ -327,5 +342,17 @@ namespace Microsoft.AspNetCore.WebUtilities } return matchCount > 0; } + + private static int CompareBuffers(byte[] buffer1, int offset1, byte[] buffer2, int offset2, int count) + { + for (; count-- > 0; offset1++, offset2++) + { + if (buffer1[offset1] != buffer2[offset2]) + { + return buffer1[offset1] - buffer2[offset2]; + } + } + return 0; + } } } diff --git a/test/Microsoft.AspNetCore.WebUtilities.Tests/MultipartReaderTests.cs b/test/Microsoft.AspNetCore.WebUtilities.Tests/MultipartReaderTests.cs index 295241f2c3..853d75e563 100644 --- a/test/Microsoft.AspNetCore.WebUtilities.Tests/MultipartReaderTests.cs +++ b/test/Microsoft.AspNetCore.WebUtilities.Tests/MultipartReaderTests.cs @@ -1,6 +1,7 @@ // Copyright (c) .NET Foundation. All rights reserved. // Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. +using System; using System.IO; using System.Text; using System.Threading.Tasks; @@ -74,11 +75,29 @@ namespace Microsoft.AspNetCore.WebUtilities "\r\n" + "--9051914041544843365972754266--\r\n"; + private const string TwoPartBodyIncompleteBuffer = +"--9051914041544843365972754266\r\n" + +"Content-Disposition: form-data; name=\"text\"\r\n" + +"\r\n" + +"text default\r\n" + +"--9051914041544843365972754266\r\n" + +"Content-Disposition: form-data; name=\"file1\"; filename=\"a.txt\"\r\n" + +"Content-Type: text/plain\r\n" + +"\r\n" + +"Content of a.txt.\r\n" + +"\r\n" + +"--9051914041544843365"; + private static MemoryStream MakeStream(string text) { return new MemoryStream(Encoding.UTF8.GetBytes(text)); } + private static string GetString(byte[] buffer, int count) + { + return Encoding.ASCII.GetString(buffer, 0, count); + } + [Fact] public async Task MutipartReader_ReadSinglePartBody_Success() { @@ -217,6 +236,47 @@ namespace Microsoft.AspNetCore.WebUtilities Assert.Null(await reader.ReadNextSectionAsync()); } + [Fact] + public void MutipartReader_BufferSizeMustBeLargerThanBoundary_Throws() + { + var stream = MakeStream(ThreePartBody); + Assert.Throws(() => + { + var reader = new MultipartReader(Boundary, stream, 5); + }); + } + + [Fact] + public async Task MutipartReader_TwoPartBodyIncompleteBuffer_TwoSectionsReadSuccessfullyThirdSectionThrows() + { + var stream = MakeStream(TwoPartBodyIncompleteBuffer); + var reader = new MultipartReader(Boundary, stream); + var buffer = new byte[128]; + + //first section can be read successfully + var section = await reader.ReadNextSectionAsync(); + Assert.NotNull(section); + Assert.Equal(1, section.Headers.Count); + Assert.Equal("form-data; name=\"text\"", section.Headers["Content-Disposition"][0]); + var read = section.Body.Read(buffer, 0, buffer.Length); + Assert.Equal("text default", GetString(buffer, read)); + + //second section can be read successfully (even though the bottom boundary is truncated) + section = await reader.ReadNextSectionAsync(); + Assert.NotNull(section); + Assert.Equal(2, section.Headers.Count); + Assert.Equal("form-data; name=\"file1\"; filename=\"a.txt\"", section.Headers["Content-Disposition"][0]); + Assert.Equal("text/plain", section.Headers["Content-Type"][0]); + read = section.Body.Read(buffer, 0, buffer.Length); + Assert.Equal("Content of a.txt.\r\n", GetString(buffer, read)); + + await Assert.ThrowsAsync(async () => + { + // we'll be unable to ensure enough bytes are buffered to even contain a final boundary + section = await reader.ReadNextSectionAsync(); + }); + } + [Fact] public async Task MutipartReader_ReadInvalidUtf8Header_ReplacementCharacters() {