Faster SubMatch implementation

Submatch has been sped up by implementing a modified Boyer–Moore–Horspool algorithm with an average-case complexity of O(N) on random text. Worst case, it behaves similarly to the previous implementation O(MN), where M is the length of the boundary and N is the length of the buffer to operate on.

Method SubMatch looks for two things:

  1. Whether the byte array segment fully contains the boundary, or
  2. Whether the byte array ends with the start of the boundary.

Case 1 is now a lot faster than the previous implementation. Case 2 remains using the same code as before. The method will do Case 1 until the matchOffset is equal to N-M. It then switches to Case 2, unless a match is found.

The code can be further sped up with a full Boyer–Moore implementation, or something more sophisticated. This however can be evaluated in the case that this implementation is insufficiently performant for our main scenarios.

This commit resolves issue #575.
This commit is contained in:
David Obando 2016-03-04 10:41:27 -08:00
parent 97940747c3
commit 15649b7e31
4 changed files with 189 additions and 29 deletions

View File

@ -0,0 +1,72 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Text;
namespace Microsoft.AspNetCore.WebUtilities
{
internal class MultipartBoundary
{
private readonly int[] _skipTable = new int[256];
private readonly string _boundary;
private bool _expectLeadingCrlf;
public MultipartBoundary(string boundary, bool expectLeadingCrlf = true)
{
if (boundary == null)
{
throw new ArgumentNullException(nameof(boundary));
}
_boundary = boundary;
_expectLeadingCrlf = expectLeadingCrlf;
Initialize(_boundary, _expectLeadingCrlf);
}
private void Initialize(string boundary, bool expectLeadingCrlf)
{
if (expectLeadingCrlf)
{
BoundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary);
}
else
{
BoundaryBytes = Encoding.UTF8.GetBytes("--" + boundary);
}
FinalBoundaryLength = BoundaryBytes.Length + 2; // Include the final '--' terminator.
var length = BoundaryBytes.Length;
for (var i = 0; i < _skipTable.Length; ++i)
{
_skipTable[i] = length;
}
for (var i = 0; i < length; ++i)
{
_skipTable[BoundaryBytes[i]] = Math.Max(1, length - 1 - i);
}
}
public int GetSkipValue(byte input)
{
return _skipTable[input];
}
public bool ExpectLeadingCrlf
{
get { return _expectLeadingCrlf; }
set
{
if (value != _expectLeadingCrlf)
{
_expectLeadingCrlf = value;
Initialize(_boundary, _expectLeadingCrlf);
}
}
}
public byte[] BoundaryBytes { get; private set; }
public int FinalBoundaryLength { get; private set; }
}
}

View File

@ -17,7 +17,7 @@ namespace Microsoft.AspNetCore.WebUtilities
private const int DefaultBufferSize = 1024 * 4;
private readonly BufferedReadStream _stream;
private readonly string _boundary;
private readonly MultipartBoundary _boundary;
private MultipartReaderStream _currentStream;
public MultipartReader(string boundary, Stream stream)
@ -42,9 +42,9 @@ namespace Microsoft.AspNetCore.WebUtilities
throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Insufficient buffer space, the buffer must be larger than the boundary: " + boundary);
}
_stream = new BufferedReadStream(stream, bufferSize);
_boundary = boundary;
_boundary = new MultipartBoundary(boundary, false);
// This stream will drain any preamble data and remove the first boundary marker.
_currentStream = new MultipartReaderStream(_stream, _boundary, expectLeadingCrlf: false);
_currentStream = new MultipartReaderStream(_stream, _boundary);
}
/// <summary>
@ -69,6 +69,7 @@ namespace Microsoft.AspNetCore.WebUtilities
return null;
}
var headers = await ReadHeadersAsync(cancellationToken);
_boundary.ExpectLeadingCrlf = true;
_currentStream = new MultipartReaderStream(_stream, _boundary);
long? baseStreamOffset = _stream.CanSeek ? (long?)_stream.Position : null;
return new MultipartSection() { Headers = headers, Body = _currentStream, BaseStreamOffset = baseStreamOffset };

View File

@ -4,7 +4,6 @@
using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
@ -12,9 +11,8 @@ namespace Microsoft.AspNetCore.WebUtilities
{
internal class MultipartReaderStream : Stream
{
private readonly MultipartBoundary _boundary;
private readonly BufferedReadStream _innerStream;
private readonly byte[] _boundaryBytes;
private readonly int _finalBoundaryLength;
private readonly long _innerOffset;
private long _position;
private long _observedLength;
@ -25,8 +23,7 @@ namespace Microsoft.AspNetCore.WebUtilities
/// </summary>
/// <param name="stream">The <see cref="BufferedReadStream"/>.</param>
/// <param name="boundary">The boundary pattern to use.</param>
/// <param name="expectLeadingCrlf">Specifies whether a leading crlf should be expected.</param>
public MultipartReaderStream(BufferedReadStream stream, string boundary, bool expectLeadingCrlf = true)
public MultipartReaderStream(BufferedReadStream stream, MultipartBoundary boundary)
{
if (stream == null)
{
@ -40,15 +37,7 @@ namespace Microsoft.AspNetCore.WebUtilities
_innerStream = stream;
_innerOffset = _innerStream.CanSeek ? _innerStream.Position : 0;
if (expectLeadingCrlf)
{
_boundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary);
}
else
{
_boundaryBytes = Encoding.UTF8.GetBytes("--" + boundary);
}
_finalBoundaryLength = _boundaryBytes.Length + 2; // Include the final '--' terminator.
_boundary = boundary;
}
public bool FinalBoundaryFound { get; private set; }
@ -205,7 +194,7 @@ namespace Microsoft.AspNetCore.WebUtilities
}
PositionInnerStream();
if (!_innerStream.EnsureBuffered(_finalBoundaryLength))
if (!_innerStream.EnsureBuffered(_boundary.FinalBoundaryLength))
{
throw new IOException("Unexpected end of stream.");
}
@ -215,7 +204,7 @@ namespace Microsoft.AspNetCore.WebUtilities
int matchOffset;
int matchCount;
int read;
if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount))
if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount))
{
// We found a possible match, return any data before it.
if (matchOffset > bufferedData.Offset)
@ -223,12 +212,12 @@ namespace Microsoft.AspNetCore.WebUtilities
read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset));
return UpdatePosition(read);
}
Debug.Assert(matchCount == _boundaryBytes.Length);
Debug.Assert(matchCount == _boundary.BoundaryBytes.Length);
// "The boundary may be followed by zero or more characters of
// linear whitespace. It is then terminated by either another CRLF"
// or -- for the final boundary.
byte[] boundary = new byte[_boundaryBytes.Length];
byte[] boundary = new byte[_boundary.BoundaryBytes.Length];
read = _innerStream.Read(boundary, 0, boundary.Length);
Debug.Assert(read == boundary.Length); // It should have all been buffered
var remainder = _innerStream.ReadLine(lengthLimit: 100); // Whitespace may exceed the buffer.
@ -256,7 +245,7 @@ namespace Microsoft.AspNetCore.WebUtilities
}
PositionInnerStream();
if (!await _innerStream.EnsureBufferedAsync(_finalBoundaryLength, cancellationToken))
if (!await _innerStream.EnsureBufferedAsync(_boundary.FinalBoundaryLength, cancellationToken))
{
throw new IOException("Unexpected end of stream.");
}
@ -266,7 +255,7 @@ namespace Microsoft.AspNetCore.WebUtilities
int matchOffset;
int matchCount;
int read;
if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount))
if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount))
{
// We found a possible match, return any data before it.
if (matchOffset > bufferedData.Offset)
@ -275,12 +264,12 @@ namespace Microsoft.AspNetCore.WebUtilities
read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset));
return UpdatePosition(read);
}
Debug.Assert(matchCount == _boundaryBytes.Length);
Debug.Assert(matchCount == _boundary.BoundaryBytes.Length);
// "The boundary may be followed by zero or more characters of
// linear whitespace. It is then terminated by either another CRLF"
// or -- for the final boundary.
byte[] boundary = new byte[_boundaryBytes.Length];
byte[] boundary = new byte[_boundary.BoundaryBytes.Length];
read = _innerStream.Read(boundary, 0, boundary.Length);
Debug.Assert(read == boundary.Length); // It should have all been buffered
var remainder = await _innerStream.ReadLineAsync(lengthLimit: 100, cancellationToken: cancellationToken); // Whitespace may exceed the buffer.
@ -300,18 +289,44 @@ namespace Microsoft.AspNetCore.WebUtilities
return UpdatePosition(read);
}
// Does Segment1 contain all of segment2, or does it end with the start of segment2?
// Does segment1 contain all of matchBytes, or does it end with the start of matchBytes?
// 1: AAAAABBBBBCCCCC
// 2: BBBBB
// Or:
// 1: AAAAABBB
// 2: BBBBB
private static bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out int matchOffset, out int matchCount)
private bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out int matchOffset, out int matchCount)
{
// clear matchCount to zero
matchCount = 0;
for (matchOffset = segment1.Offset; matchOffset < segment1.Offset + segment1.Count; matchOffset++)
// case 1: does segment1 fully contain matchBytes?
{
int countLimit = segment1.Offset - matchOffset + segment1.Count;
var matchBytesLengthMinusOne = matchBytes.Length - 1;
var matchBytesLastByte = matchBytes[matchBytesLengthMinusOne];
var segmentEndMinusMatchBytesLength = segment1.Offset + segment1.Count - matchBytes.Length;
matchOffset = segment1.Offset;
while (matchOffset < segmentEndMinusMatchBytesLength)
{
var lookaheadTailChar = segment1.Array[matchOffset + matchBytesLengthMinusOne];
if (lookaheadTailChar == matchBytesLastByte &&
CompareBuffers(segment1.Array, matchOffset, matchBytes, 0, matchBytesLengthMinusOne) == 0)
{
matchCount = matchBytes.Length;
return true;
}
matchOffset += _boundary.GetSkipValue(lookaheadTailChar);
}
}
// case 2: does segment1 end with the start of matchBytes?
var segmentEnd = segment1.Offset + segment1.Count;
matchCount = 0;
for (; matchOffset < segmentEnd; matchOffset++)
{
var countLimit = segmentEnd - matchOffset;
for (matchCount = 0; matchCount < matchBytes.Length && matchCount < countLimit; matchCount++)
{
if (matchBytes[matchCount] != segment1.Array[matchOffset + matchCount])
@ -327,5 +342,17 @@ namespace Microsoft.AspNetCore.WebUtilities
}
return matchCount > 0;
}
private static int CompareBuffers(byte[] buffer1, int offset1, byte[] buffer2, int offset2, int count)
{
for (; count-- > 0; offset1++, offset2++)
{
if (buffer1[offset1] != buffer2[offset2])
{
return buffer1[offset1] - buffer2[offset2];
}
}
return 0;
}
}
}

View File

@ -1,6 +1,7 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.IO;
using System.Text;
using System.Threading.Tasks;
@ -74,11 +75,29 @@ namespace Microsoft.AspNetCore.WebUtilities
"\r\n" +
"--9051914041544843365972754266--\r\n";
private const string TwoPartBodyIncompleteBuffer =
"--9051914041544843365972754266\r\n" +
"Content-Disposition: form-data; name=\"text\"\r\n" +
"\r\n" +
"text default\r\n" +
"--9051914041544843365972754266\r\n" +
"Content-Disposition: form-data; name=\"file1\"; filename=\"a.txt\"\r\n" +
"Content-Type: text/plain\r\n" +
"\r\n" +
"Content of a.txt.\r\n" +
"\r\n" +
"--9051914041544843365";
private static MemoryStream MakeStream(string text)
{
return new MemoryStream(Encoding.UTF8.GetBytes(text));
}
private static string GetString(byte[] buffer, int count)
{
return Encoding.ASCII.GetString(buffer, 0, count);
}
[Fact]
public async Task MutipartReader_ReadSinglePartBody_Success()
{
@ -217,6 +236,47 @@ namespace Microsoft.AspNetCore.WebUtilities
Assert.Null(await reader.ReadNextSectionAsync());
}
[Fact]
public void MutipartReader_BufferSizeMustBeLargerThanBoundary_Throws()
{
var stream = MakeStream(ThreePartBody);
Assert.Throws<ArgumentOutOfRangeException>(() =>
{
var reader = new MultipartReader(Boundary, stream, 5);
});
}
[Fact]
public async Task MutipartReader_TwoPartBodyIncompleteBuffer_TwoSectionsReadSuccessfullyThirdSectionThrows()
{
var stream = MakeStream(TwoPartBodyIncompleteBuffer);
var reader = new MultipartReader(Boundary, stream);
var buffer = new byte[128];
//first section can be read successfully
var section = await reader.ReadNextSectionAsync();
Assert.NotNull(section);
Assert.Equal(1, section.Headers.Count);
Assert.Equal("form-data; name=\"text\"", section.Headers["Content-Disposition"][0]);
var read = section.Body.Read(buffer, 0, buffer.Length);
Assert.Equal("text default", GetString(buffer, read));
//second section can be read successfully (even though the bottom boundary is truncated)
section = await reader.ReadNextSectionAsync();
Assert.NotNull(section);
Assert.Equal(2, section.Headers.Count);
Assert.Equal("form-data; name=\"file1\"; filename=\"a.txt\"", section.Headers["Content-Disposition"][0]);
Assert.Equal("text/plain", section.Headers["Content-Type"][0]);
read = section.Body.Read(buffer, 0, buffer.Length);
Assert.Equal("Content of a.txt.\r\n", GetString(buffer, read));
await Assert.ThrowsAsync<IOException>(async () =>
{
// we'll be unable to ensure enough bytes are buffered to even contain a final boundary
section = await reader.ReadNextSectionAsync();
});
}
[Fact]
public async Task MutipartReader_ReadInvalidUtf8Header_ReplacementCharacters()
{