[Infrastructure] Adds support for capturing process dumps for hanging builds on Windows (#13912)
* Downloads and installs ProcDump as part of the build.
* Registers a scheduled job that wakes up in 160 minutes.
* Upon waking up, the scheduled job lists all the processes that match the candidate names it received.
* For each of those processes, it captures a full memory dump.
* At the end of the build, a separate step checks whether the job ran (the build hung) or everything went fine, and displays statistics.
* If dumps are found, they are collected and made available as artifacts under Windows(_Templates)_Tests_Logs.
This commit is contained in:
parent
8b7f662169
commit
85af1fd66b
|
|
@ -126,6 +126,11 @@ jobs:
|
||||||
steps:
|
steps:
|
||||||
- checkout: self
|
- checkout: self
|
||||||
clean: true
|
clean: true
|
||||||
|
- ${{ if and(eq(parameters.agentOs, 'Windows'), eq(parameters.isTestingJob, true)) }}:
|
||||||
|
- powershell: ./eng/scripts/InstallProcDump.ps1
|
||||||
|
displayName: Install ProcDump
|
||||||
|
- powershell: ./eng/scripts/StartDumpCollectionForHangingBuilds.ps1 $(ProcDumpPath)procdump.exe artifacts/log/ (Get-Date).AddMinutes(160) dotnet
|
||||||
|
displayName: Start background dump collection
|
||||||
- ${{ if eq(parameters.installNodeJs, 'true') }}:
|
- ${{ if eq(parameters.installNodeJs, 'true') }}:
|
||||||
- task: NodeTool@0
|
- task: NodeTool@0
|
||||||
displayName: Install Node 10.x
|
displayName: Install Node 10.x
|
||||||
|
|
@ -165,6 +170,12 @@ jobs:
|
||||||
|
|
||||||
- ${{ parameters.afterBuild }}
|
- ${{ parameters.afterBuild }}
|
||||||
|
|
||||||
|
- ${{ if and(eq(parameters.agentOs, 'Windows'), eq(parameters.isTestingJob, true)) }}:
|
||||||
|
- powershell: ./eng/scripts/FinishDumpCollectionForHangingBuilds.ps1 artifacts/log/
|
||||||
|
displayName: Finish background dump collection
|
||||||
|
continueOnError: true
|
||||||
|
condition: always()
|
||||||
|
|
||||||
- ${{ if eq(parameters.agentOs, 'Windows') }}:
|
- ${{ if eq(parameters.agentOs, 'Windows') }}:
|
||||||
- powershell: eng\scripts\KillProcesses.ps1
|
- powershell: eng\scripts\KillProcesses.ps1
|
||||||
displayName: Kill processes
|
displayName: Kill processes
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,76 @@
|
||||||
|
<#
.SYNOPSIS
Finishes the dump collection that StartDumpCollectionForHangingBuilds.ps1 set up.
.DESCRIPTION
Looks for the 'dump-sentinel.txt' file in the dump output folder and reads the scheduled job name
from it. Reports ProcDump processes that are still running and the '*.dmp' files found in the
folder. If an instance of the scheduled job exists, waits for it, prints its output and removes
it. The scheduled job registration is removed whether or not the job ran.
.PARAMETER ProcDumpOutputPath
Folder, relative to the repository root, that holds the sentinel file and the captured dumps.
#>
param(
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]
    $ProcDumpOutputPath
)

Write-Output "Finishing dump collection for hanging builds.";

# The output path is interpreted relative to the repository root (two levels above this script).
$repoRoot = Resolve-Path "$PSScriptRoot\..\..";
$ProcDumpOutputPath = Join-Path $repoRoot $ProcDumpOutputPath;

$sentinelFile = Join-Path $ProcDumpOutputPath "dump-sentinel.txt";
if (-not (Test-Path $sentinelFile)) {
    # The parentheses force expression mode; without them '+' is passed to Write-Output as a
    # literal argument and the second string becomes a separate statement.
    Write-Output ("No sentinel file available in '$sentinelFile'. " +
        "StartDumpCollectionForHangingBuilds.ps1 has not been executed, is not correctly configured or failed before creating the sentinel file.");
    return;
}

Get-Process "procdump" -ErrorAction SilentlyContinue |
    ForEach-Object { Write-Output "ProcDump with PID $($_.Id) is still running."; };

$capturedDumps = Get-ChildItem $ProcDumpOutputPath -Filter *.dmp;
$capturedDumps | ForEach-Object { Write-Output "Found captured dump $_"; };

# @(...) always yields an array, so Count is well defined for empty and single-line files.
$jobNames = @(Get-Content $sentinelFile);

if ($jobNames.Count -eq 0) {
    Write-Warning "No job name found. This is likely an error.";
    return;
}

if ($jobNames.Count -gt 1) {
    Write-Output "Multiple job names found '$jobNames'.";
    return;
}

$JobName = $jobNames[0];

# Get-Job only returns instances of scheduled jobs after the PSScheduledJob module has been
# loaded into the session, so load it before querying.
Import-Module PSScheduledJob -ErrorAction SilentlyContinue;

$registeredJob = Get-ScheduledJob -Name $JobName -ErrorAction SilentlyContinue;
$dumpCollectionJob = Get-Job -Name $JobName -ErrorAction SilentlyContinue;

if ($null -eq $dumpCollectionJob) {
    Write-Output "No job found for '$JobName'. It either didn't run or there is an issue with the job definition.";

    if ($null -eq $registeredJob) {
        Write-Warning "Couldn't find a scheduled job '$JobName'.";
    }
}
else {
    Write-Output "Listing existing jobs";
    Get-Job -Name CaptureDumps*

    Write-Output "Listing existing scheduled jobs";
    Get-ScheduledJob -Name CaptureDumps*

    # Wait before reading the output so that the output of a job that is still running
    # is not cut short.
    Write-Output "Waiting for current job to finish";
    $dumpCollectionJob | Wait-Job;

    Write-Output "Displaying job output";
    Receive-Job $dumpCollectionJob;

    try {
        Write-Output "Removing collection job";
        # -ErrorAction Stop turns cmdlet errors into exceptions so the catch block can run.
        Remove-Job -Job $dumpCollectionJob -ErrorAction Stop;
    }
    catch {
        Write-Output "Failed to remove collection job";
    }
}

# Remove the registration even when the job never ran; otherwise the trigger could still fire
# after this build has finished.
if ($null -ne $registeredJob) {
    try {
        Write-Output "Unregistering scheduled job";
        Unregister-ScheduledJob -InputObject $registeredJob -ErrorAction Stop;
    }
    catch {
        Write-Output "Failed to unregister $JobName";
    }
}
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
<#
.SYNOPSIS
    Installs ProcDump into a folder in this repo.
.DESCRIPTION
    Downloads the ProcDump zip archive, extracts it under the repo's 'obj' folder and moves the
    extracted files into '.tools\ProcDump\'. When running with TF_BUILD set, the install folder is
    published as the 'ProcDumpPath' pipeline variable and prepended to the PATH.
.PARAMETER Force
    Overwrite the existing installation
#>
param(
    [switch]$Force
)

$ErrorActionPreference = 'Stop'
$ProgressPreference = 'SilentlyContinue' # Workaround PowerShell/PowerShell#2138

Set-StrictMode -Version 1

# All locations are derived from the repository root (two levels above this script).
$repoRoot = Resolve-Path "$PSScriptRoot\..\.."
$installDir = "$repoRoot\.tools\ProcDump\"
$tempDir = "$repoRoot\obj"
$archivePath = "$tempDir/ProcDump.zip"
$extractDir = "$tempDir/ProcDump/"

$alreadyInstalled = Test-Path $installDir

# Nothing to do when an installation exists and the caller did not ask to replace it.
if ($alreadyInstalled -and -not $Force) {
    Write-Host "ProcDump already installed to $installDir. Exiting without action. Call this script again with -Force to overwrite."
    exit 0
}

if ($alreadyInstalled) {
    Remove-Item -Force -Recurse $installDir
}

# Start from an empty temp folder; failures to delete or create are deliberately ignored.
Remove-Item -Force -Recurse $tempDir -ErrorAction Ignore | Out-Null
New-Item -ItemType Directory -Path $tempDir -ErrorAction Ignore | Out-Null
New-Item -ItemType Directory -Path $installDir -ErrorAction Ignore | Out-Null

Write-Host "Starting ProcDump download"
Invoke-WebRequest -UseBasicParsing -Uri "https://download.sysinternals.com/files/Procdump.zip" -OutFile $archivePath
Write-Host "Done downloading ProcDump"

Expand-Archive -Path $archivePath -DestinationPath $extractDir
Write-Host "Expanded ProcDump to $tempDir"

Write-Host "Installing ProcDump to $installDir"
Move-Item -Path "${extractDir}*" -Destination $installDir
Write-Host "Done installing ProcDump to $installDir"

# On the build agent, expose the install location to the following pipeline steps.
if ($env:TF_BUILD) {
    Write-Host "##vso[task.setvariable variable=ProcDumpPath]$installDir"
    Write-Host "##vso[task.prependpath]$installDir"
}
|
||||||
|
|
@ -0,0 +1,123 @@
|
||||||
|
<#
.SYNOPSIS
Registers a scheduled job that captures process dumps if the build is still running at a given time.
.DESCRIPTION
Removes leftover 'CaptureDumps*' jobs and scheduled jobs, writes the new job name to a
'dump-sentinel.txt' file in the dump output folder, and registers a scheduled job that fires once
at WakeTime. When it fires, the job runs ProcDump against every running process whose name matches
one of the candidate names.
.PARAMETER ProcDumpPath
Path to the ProcDump executable that the scheduled job starts.
.PARAMETER ProcDumpOutputPath
Folder, relative to the repository root, where the sentinel file and the dumps are written.
.PARAMETER WakeTime
Point in time at which the scheduled job is triggered.
.PARAMETER CandidateProcessNames
Names of the processes to capture dumps for.
#>
param(
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]
    $ProcDumpPath,
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]
    $ProcDumpOutputPath,
    [Parameter(Mandatory = $true)]
    [datetime]
    $WakeTime,
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string[]]
    $CandidateProcessNames
)

Write-Output "Setting up a scheduled job to capture process dumps.";

# A missing ProcDump is only reported; the job is still registered (best effort).
if (-not (Test-Path $ProcDumpPath)) {
    Write-Warning "Can't find ProcDump at '$ProcDumpPath'.";
}
else {
    Write-Output "Using ProcDump from '$ProcDumpPath'.";
}

try {
    # Get-Job only returns instances of scheduled jobs after the PSScheduledJob module has been
    # loaded into the session, so load it before looking for leftovers.
    Import-Module PSScheduledJob -ErrorAction SilentlyContinue;

    $previousJobs = Get-Job -Name CaptureDumps* -ErrorAction SilentlyContinue;
    $previousScheduledJobs = Get-ScheduledJob CaptureDumps* -ErrorAction SilentlyContinue;

    if ($previousJobs.Count -ne 0) {
        Write-Output "Found existing dump jobs.";
    }

    if ($previousScheduledJobs.Count -ne 0) {
        Write-Output "Found existing scheduled dump jobs.";
    }

    $previousJobs | Stop-Job -PassThru | Remove-Job;
    $previousScheduledJobs | Unregister-ScheduledJob;
}
catch {
    Write-Output "There was an error cleaning up previous jobs.";
    Write-Output $_.Exception.Message;
}

# The output path is interpreted relative to the repository root (two levels above this script).
$repoRoot = Resolve-Path "$PSScriptRoot\..\..";
$ProcDumpOutputPath = Join-Path $repoRoot $ProcDumpOutputPath;

Write-Output "Dumps will be placed at '$ProcDumpOutputPath'.";
Write-Output "Watching processes $($CandidateProcessNames -join ', ')";

# This script registers a scheduled job. The scheduled job executes at $WakeTime.
# When the scheduled job executes, it runs procdump on all alive processes whose name matches $CandidateProcessNames.
# The dumps are placed in $ProcDumpOutputPath.
# If the build completes successfully before $WakeTime, a final build step
# (FinishDumpCollectionForHangingBuilds.ps1) is expected to clean the job up.

# Create a unique identifier for the job name
$JobName = "CaptureDumps" + (New-Guid).ToString("N");

# Ensure that the dumps output path exists.
if (-not (Test-Path $ProcDumpOutputPath)) {
    New-Item -ItemType Directory $ProcDumpOutputPath | Out-Null;
}

# We write a sentinel file that we use at the end of the build to
# find the job we started and to determine the results from the scheduled
# job (whether it ran or not, and to display the output from the job).
$sentinelFile = Join-Path $ProcDumpOutputPath "dump-sentinel.txt";
Out-File -FilePath $sentinelFile -InputObject $JobName | Out-Null;

# Body of the scheduled job. It receives everything it needs through its parameters and does
# not reference variables of this script.
[scriptblock] $ScriptCode = {
    param(
        $ProcDumpPath,
        $ProcDumpOutputPath,
        $CandidateProcessNames)

    Write-Output "Waking up to capture process dumps. Determining hanging processes.";

    [System.Diagnostics.Process[]]$AliveProcesses = @();
    foreach ($candidate in $CandidateProcessNames) {
        try {
            # -ErrorAction Stop makes "process not found" an exception; without it the error is
            # non-terminating and the catch block below is never reached.
            $candidateProcesses = Get-Process -Name $candidate -ErrorAction Stop;
            $candidateProcesses | ForEach-Object { Write-Output "Found candidate process $candidate with PID '$($_.Id)'." };
            $AliveProcesses += $candidateProcesses;
        }
        catch {
            Write-Output "No process found for $candidate";
        }
    }

    Write-Output "Starting process dump capture.";

    # The upper-case tokens in the file name are passed to ProcDump verbatim.
    # NOTE(review): presumably ProcDump expands them per dump - confirm against its documentation.
    $dumpFullPath = [System.IO.Path]::Combine($ProcDumpOutputPath, "hung_PROCESSNAME_PID_YYMMDD_HHMMSS.dmp");

    Write-Output "Capturing output for $($AliveProcesses.Length) processes.";

    foreach ($process in $AliveProcesses) {

        # Start-Process joins the arguments with spaces, so the path is wrapped in quotes to
        # keep it a single argument when it contains spaces.
        $procDumpArgs = @("-accepteula", "-ma", $process.Id, "`"$dumpFullPath`"");
        try {
            Write-Output "Capturing dump for '$($process.Name)' with PID '$($process.Id)'.";
            Start-Process -FilePath $ProcDumpPath -ArgumentList $procDumpArgs -NoNewWindow -Wait;
        }
        catch {
            Write-Output "There was an error capturing a process dump for '$($process.Name)' with PID '$($process.Id)'."
            Write-Warning $_.Exception.Message;
        }
    }

    Write-Output "Done capturing process dumps.";
}

$ScriptTrigger = New-JobTrigger -Once -At $WakeTime;

try {
    # -ErrorAction Stop so that a failed registration reaches the catch block below.
    Register-ScheduledJob -Name $JobName -ScriptBlock $ScriptCode -Trigger $ScriptTrigger -ArgumentList $ProcDumpPath, $ProcDumpOutputPath, $CandidateProcessNames -ErrorAction Stop;
}
catch {
    Write-Warning "Failed to register scheduled job '$JobName'. Dumps will not be captured for build hangs.";
    Write-Warning $_.Exception.Message;
}
|
||||||
Loading…
Reference in New Issue