Faster SubMatch implementation
Submatch has been sped up by implementing a modified Boyer–Moore–Horspool algorithm with an average-case complexity of O(N) on random text. Worst case, it behaves similarly to the previous implementation O(MN), where M is the length of the boundary and N is the length of the buffer to operate on. Method SubMatch looks for two things: 1. Whether the byte array segment fully contains the boundary, or 2. Whether the byte array ends with the start of the boundary. Case 1 is now a lot faster than the previous implementation. Case 2 remains using the same code as before. The method will do Case 1 until the matchOffset is equal to N-M. It then switches to Case 2, unless a match is found. The code can be further sped up with a full Boyer–Moore implementation, or something more sophisticated. This however can be evaluated in the case that this implementation is insufficiently performant for our main scenarios. This commit resolves issue #575.
This commit is contained in:
parent
97940747c3
commit
15649b7e31
|
|
@ -0,0 +1,72 @@
|
|||
// Copyright (c) .NET Foundation. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Text;
|
||||
|
||||
namespace Microsoft.AspNetCore.WebUtilities
|
||||
{
|
||||
internal class MultipartBoundary
|
||||
{
|
||||
private readonly int[] _skipTable = new int[256];
|
||||
private readonly string _boundary;
|
||||
private bool _expectLeadingCrlf;
|
||||
|
||||
public MultipartBoundary(string boundary, bool expectLeadingCrlf = true)
|
||||
{
|
||||
if (boundary == null)
|
||||
{
|
||||
throw new ArgumentNullException(nameof(boundary));
|
||||
}
|
||||
|
||||
_boundary = boundary;
|
||||
_expectLeadingCrlf = expectLeadingCrlf;
|
||||
Initialize(_boundary, _expectLeadingCrlf);
|
||||
}
|
||||
|
||||
private void Initialize(string boundary, bool expectLeadingCrlf)
|
||||
{
|
||||
if (expectLeadingCrlf)
|
||||
{
|
||||
BoundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary);
|
||||
}
|
||||
else
|
||||
{
|
||||
BoundaryBytes = Encoding.UTF8.GetBytes("--" + boundary);
|
||||
}
|
||||
FinalBoundaryLength = BoundaryBytes.Length + 2; // Include the final '--' terminator.
|
||||
|
||||
var length = BoundaryBytes.Length;
|
||||
for (var i = 0; i < _skipTable.Length; ++i)
|
||||
{
|
||||
_skipTable[i] = length;
|
||||
}
|
||||
for (var i = 0; i < length; ++i)
|
||||
{
|
||||
_skipTable[BoundaryBytes[i]] = Math.Max(1, length - 1 - i);
|
||||
}
|
||||
}
|
||||
|
||||
public int GetSkipValue(byte input)
|
||||
{
|
||||
return _skipTable[input];
|
||||
}
|
||||
|
||||
public bool ExpectLeadingCrlf
|
||||
{
|
||||
get { return _expectLeadingCrlf; }
|
||||
set
|
||||
{
|
||||
if (value != _expectLeadingCrlf)
|
||||
{
|
||||
_expectLeadingCrlf = value;
|
||||
Initialize(_boundary, _expectLeadingCrlf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public byte[] BoundaryBytes { get; private set; }
|
||||
|
||||
public int FinalBoundaryLength { get; private set; }
|
||||
}
|
||||
}
|
||||
|
|
@ -17,7 +17,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
private const int DefaultBufferSize = 1024 * 4;
|
||||
|
||||
private readonly BufferedReadStream _stream;
|
||||
private readonly string _boundary;
|
||||
private readonly MultipartBoundary _boundary;
|
||||
private MultipartReaderStream _currentStream;
|
||||
|
||||
public MultipartReader(string boundary, Stream stream)
|
||||
|
|
@ -42,9 +42,9 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Insufficient buffer space, the buffer must be larger than the boundary: " + boundary);
|
||||
}
|
||||
_stream = new BufferedReadStream(stream, bufferSize);
|
||||
_boundary = boundary;
|
||||
_boundary = new MultipartBoundary(boundary, false);
|
||||
// This stream will drain any preamble data and remove the first boundary marker.
|
||||
_currentStream = new MultipartReaderStream(_stream, _boundary, expectLeadingCrlf: false);
|
||||
_currentStream = new MultipartReaderStream(_stream, _boundary);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
@ -69,6 +69,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
return null;
|
||||
}
|
||||
var headers = await ReadHeadersAsync(cancellationToken);
|
||||
_boundary.ExpectLeadingCrlf = true;
|
||||
_currentStream = new MultipartReaderStream(_stream, _boundary);
|
||||
long? baseStreamOffset = _stream.CanSeek ? (long?)_stream.Position : null;
|
||||
return new MultipartSection() { Headers = headers, Body = _currentStream, BaseStreamOffset = baseStreamOffset };
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@
|
|||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
|
|
@ -12,9 +11,8 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
{
|
||||
internal class MultipartReaderStream : Stream
|
||||
{
|
||||
private readonly MultipartBoundary _boundary;
|
||||
private readonly BufferedReadStream _innerStream;
|
||||
private readonly byte[] _boundaryBytes;
|
||||
private readonly int _finalBoundaryLength;
|
||||
private readonly long _innerOffset;
|
||||
private long _position;
|
||||
private long _observedLength;
|
||||
|
|
@ -25,8 +23,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
/// </summary>
|
||||
/// <param name="stream">The <see cref="BufferedReadStream"/>.</param>
|
||||
/// <param name="boundary">The boundary pattern to use.</param>
|
||||
/// <param name="expectLeadingCrlf">Specifies whether a leading crlf should be expected.</param>
|
||||
public MultipartReaderStream(BufferedReadStream stream, string boundary, bool expectLeadingCrlf = true)
|
||||
public MultipartReaderStream(BufferedReadStream stream, MultipartBoundary boundary)
|
||||
{
|
||||
if (stream == null)
|
||||
{
|
||||
|
|
@ -40,15 +37,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
|
||||
_innerStream = stream;
|
||||
_innerOffset = _innerStream.CanSeek ? _innerStream.Position : 0;
|
||||
if (expectLeadingCrlf)
|
||||
{
|
||||
_boundaryBytes = Encoding.UTF8.GetBytes("\r\n--" + boundary);
|
||||
}
|
||||
else
|
||||
{
|
||||
_boundaryBytes = Encoding.UTF8.GetBytes("--" + boundary);
|
||||
}
|
||||
_finalBoundaryLength = _boundaryBytes.Length + 2; // Include the final '--' terminator.
|
||||
_boundary = boundary;
|
||||
}
|
||||
|
||||
public bool FinalBoundaryFound { get; private set; }
|
||||
|
|
@ -205,7 +194,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
}
|
||||
|
||||
PositionInnerStream();
|
||||
if (!_innerStream.EnsureBuffered(_finalBoundaryLength))
|
||||
if (!_innerStream.EnsureBuffered(_boundary.FinalBoundaryLength))
|
||||
{
|
||||
throw new IOException("Unexpected end of stream.");
|
||||
}
|
||||
|
|
@ -215,7 +204,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
int matchOffset;
|
||||
int matchCount;
|
||||
int read;
|
||||
if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount))
|
||||
if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount))
|
||||
{
|
||||
// We found a possible match, return any data before it.
|
||||
if (matchOffset > bufferedData.Offset)
|
||||
|
|
@ -223,12 +212,12 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset));
|
||||
return UpdatePosition(read);
|
||||
}
|
||||
Debug.Assert(matchCount == _boundaryBytes.Length);
|
||||
Debug.Assert(matchCount == _boundary.BoundaryBytes.Length);
|
||||
|
||||
// "The boundary may be followed by zero or more characters of
|
||||
// linear whitespace. It is then terminated by either another CRLF"
|
||||
// or -- for the final boundary.
|
||||
byte[] boundary = new byte[_boundaryBytes.Length];
|
||||
byte[] boundary = new byte[_boundary.BoundaryBytes.Length];
|
||||
read = _innerStream.Read(boundary, 0, boundary.Length);
|
||||
Debug.Assert(read == boundary.Length); // It should have all been buffered
|
||||
var remainder = _innerStream.ReadLine(lengthLimit: 100); // Whitespace may exceed the buffer.
|
||||
|
|
@ -256,7 +245,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
}
|
||||
|
||||
PositionInnerStream();
|
||||
if (!await _innerStream.EnsureBufferedAsync(_finalBoundaryLength, cancellationToken))
|
||||
if (!await _innerStream.EnsureBufferedAsync(_boundary.FinalBoundaryLength, cancellationToken))
|
||||
{
|
||||
throw new IOException("Unexpected end of stream.");
|
||||
}
|
||||
|
|
@ -266,7 +255,7 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
int matchOffset;
|
||||
int matchCount;
|
||||
int read;
|
||||
if (SubMatch(bufferedData, _boundaryBytes, out matchOffset, out matchCount))
|
||||
if (SubMatch(bufferedData, _boundary.BoundaryBytes, out matchOffset, out matchCount))
|
||||
{
|
||||
// We found a possible match, return any data before it.
|
||||
if (matchOffset > bufferedData.Offset)
|
||||
|
|
@ -275,12 +264,12 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
read = _innerStream.Read(buffer, offset, Math.Min(count, matchOffset - bufferedData.Offset));
|
||||
return UpdatePosition(read);
|
||||
}
|
||||
Debug.Assert(matchCount == _boundaryBytes.Length);
|
||||
Debug.Assert(matchCount == _boundary.BoundaryBytes.Length);
|
||||
|
||||
// "The boundary may be followed by zero or more characters of
|
||||
// linear whitespace. It is then terminated by either another CRLF"
|
||||
// or -- for the final boundary.
|
||||
byte[] boundary = new byte[_boundaryBytes.Length];
|
||||
byte[] boundary = new byte[_boundary.BoundaryBytes.Length];
|
||||
read = _innerStream.Read(boundary, 0, boundary.Length);
|
||||
Debug.Assert(read == boundary.Length); // It should have all been buffered
|
||||
var remainder = await _innerStream.ReadLineAsync(lengthLimit: 100, cancellationToken: cancellationToken); // Whitespace may exceed the buffer.
|
||||
|
|
@ -300,18 +289,44 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
return UpdatePosition(read);
|
||||
}
|
||||
|
||||
// Does Segment1 contain all of segment2, or does it end with the start of segment2?
|
||||
// Does segment1 contain all of matchBytes, or does it end with the start of matchBytes?
|
||||
// 1: AAAAABBBBBCCCCC
|
||||
// 2: BBBBB
|
||||
// Or:
|
||||
// 1: AAAAABBB
|
||||
// 2: BBBBB
|
||||
private static bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out int matchOffset, out int matchCount)
|
||||
private bool SubMatch(ArraySegment<byte> segment1, byte[] matchBytes, out int matchOffset, out int matchCount)
|
||||
{
|
||||
// clear matchCount to zero
|
||||
matchCount = 0;
|
||||
for (matchOffset = segment1.Offset; matchOffset < segment1.Offset + segment1.Count; matchOffset++)
|
||||
|
||||
// case 1: does segment1 fully contain matchBytes?
|
||||
{
|
||||
int countLimit = segment1.Offset - matchOffset + segment1.Count;
|
||||
var matchBytesLengthMinusOne = matchBytes.Length - 1;
|
||||
var matchBytesLastByte = matchBytes[matchBytesLengthMinusOne];
|
||||
var segmentEndMinusMatchBytesLength = segment1.Offset + segment1.Count - matchBytes.Length;
|
||||
|
||||
matchOffset = segment1.Offset;
|
||||
while (matchOffset < segmentEndMinusMatchBytesLength)
|
||||
{
|
||||
var lookaheadTailChar = segment1.Array[matchOffset + matchBytesLengthMinusOne];
|
||||
if (lookaheadTailChar == matchBytesLastByte &&
|
||||
CompareBuffers(segment1.Array, matchOffset, matchBytes, 0, matchBytesLengthMinusOne) == 0)
|
||||
{
|
||||
matchCount = matchBytes.Length;
|
||||
return true;
|
||||
}
|
||||
matchOffset += _boundary.GetSkipValue(lookaheadTailChar);
|
||||
}
|
||||
}
|
||||
|
||||
// case 2: does segment1 end with the start of matchBytes?
|
||||
var segmentEnd = segment1.Offset + segment1.Count;
|
||||
|
||||
matchCount = 0;
|
||||
for (; matchOffset < segmentEnd; matchOffset++)
|
||||
{
|
||||
var countLimit = segmentEnd - matchOffset;
|
||||
for (matchCount = 0; matchCount < matchBytes.Length && matchCount < countLimit; matchCount++)
|
||||
{
|
||||
if (matchBytes[matchCount] != segment1.Array[matchOffset + matchCount])
|
||||
|
|
@ -327,5 +342,17 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
}
|
||||
return matchCount > 0;
|
||||
}
|
||||
|
||||
private static int CompareBuffers(byte[] buffer1, int offset1, byte[] buffer2, int offset2, int count)
|
||||
{
|
||||
for (; count-- > 0; offset1++, offset2++)
|
||||
{
|
||||
if (buffer1[offset1] != buffer2[offset2])
|
||||
{
|
||||
return buffer1[offset1] - buffer2[offset2];
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
// Copyright (c) .NET Foundation. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
|
@ -74,11 +75,29 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
"\r\n" +
|
||||
"--9051914041544843365972754266--\r\n";
|
||||
|
||||
private const string TwoPartBodyIncompleteBuffer =
|
||||
"--9051914041544843365972754266\r\n" +
|
||||
"Content-Disposition: form-data; name=\"text\"\r\n" +
|
||||
"\r\n" +
|
||||
"text default\r\n" +
|
||||
"--9051914041544843365972754266\r\n" +
|
||||
"Content-Disposition: form-data; name=\"file1\"; filename=\"a.txt\"\r\n" +
|
||||
"Content-Type: text/plain\r\n" +
|
||||
"\r\n" +
|
||||
"Content of a.txt.\r\n" +
|
||||
"\r\n" +
|
||||
"--9051914041544843365";
|
||||
|
||||
private static MemoryStream MakeStream(string text)
|
||||
{
|
||||
return new MemoryStream(Encoding.UTF8.GetBytes(text));
|
||||
}
|
||||
|
||||
private static string GetString(byte[] buffer, int count)
|
||||
{
|
||||
return Encoding.ASCII.GetString(buffer, 0, count);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task MutipartReader_ReadSinglePartBody_Success()
|
||||
{
|
||||
|
|
@ -217,6 +236,47 @@ namespace Microsoft.AspNetCore.WebUtilities
|
|||
Assert.Null(await reader.ReadNextSectionAsync());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MutipartReader_BufferSizeMustBeLargerThanBoundary_Throws()
|
||||
{
|
||||
var stream = MakeStream(ThreePartBody);
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
{
|
||||
var reader = new MultipartReader(Boundary, stream, 5);
|
||||
});
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task MutipartReader_TwoPartBodyIncompleteBuffer_TwoSectionsReadSuccessfullyThirdSectionThrows()
|
||||
{
|
||||
var stream = MakeStream(TwoPartBodyIncompleteBuffer);
|
||||
var reader = new MultipartReader(Boundary, stream);
|
||||
var buffer = new byte[128];
|
||||
|
||||
//first section can be read successfully
|
||||
var section = await reader.ReadNextSectionAsync();
|
||||
Assert.NotNull(section);
|
||||
Assert.Equal(1, section.Headers.Count);
|
||||
Assert.Equal("form-data; name=\"text\"", section.Headers["Content-Disposition"][0]);
|
||||
var read = section.Body.Read(buffer, 0, buffer.Length);
|
||||
Assert.Equal("text default", GetString(buffer, read));
|
||||
|
||||
//second section can be read successfully (even though the bottom boundary is truncated)
|
||||
section = await reader.ReadNextSectionAsync();
|
||||
Assert.NotNull(section);
|
||||
Assert.Equal(2, section.Headers.Count);
|
||||
Assert.Equal("form-data; name=\"file1\"; filename=\"a.txt\"", section.Headers["Content-Disposition"][0]);
|
||||
Assert.Equal("text/plain", section.Headers["Content-Type"][0]);
|
||||
read = section.Body.Read(buffer, 0, buffer.Length);
|
||||
Assert.Equal("Content of a.txt.\r\n", GetString(buffer, read));
|
||||
|
||||
await Assert.ThrowsAsync<IOException>(async () =>
|
||||
{
|
||||
// we'll be unable to ensure enough bytes are buffered to even contain a final boundary
|
||||
section = await reader.ReadNextSectionAsync();
|
||||
});
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task MutipartReader_ReadInvalidUtf8Header_ReplacementCharacters()
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue