Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Internal] Client Telemetry: Adds network information in the payload #3691

Merged
merged 17 commits into from
Feb 27, 2023
Merged
5 changes: 3 additions & 2 deletions Microsoft.Azure.Cosmos/src/Handler/TelemetryHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@ public override async Task<ResponseMessage> SendAsync(
resourceType: request.ResourceType,
consistencyLevel: request.Headers?[Documents.HttpConstants.HttpHeaders.ConsistencyLevel],
requestCharge: response.Headers.RequestCharge,
subStatusCode: response.Headers.SubStatusCode);
subStatusCode: response.Headers.SubStatusCode,
trace: response.Trace);
}
catch (Exception ex)
{
DefaultTrace.TraceError("Error while collecting telemetry information : " + ex.Message);
DefaultTrace.TraceError("Error while collecting telemetry information : {0}", ex);
}
}
return response;
Expand Down
73 changes: 60 additions & 13 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetry.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ namespace Microsoft.Azure.Cosmos.Telemetry
using Microsoft.Azure.Cosmos.Core.Trace;
using Microsoft.Azure.Cosmos.Routing;
using Microsoft.Azure.Cosmos.Telemetry.Models;
using Microsoft.Azure.Cosmos.Tracing;
using Microsoft.Azure.Cosmos.Tracing.TraceData;
using Microsoft.Azure.Documents;
using Microsoft.Azure.Documents.Collections;
using Microsoft.Azure.Documents.Rntbd;
using Newtonsoft.Json;
using Util;
using static Microsoft.Azure.Cosmos.Tracing.TraceData.ClientSideRequestStatisticsTraceDatum;

/// <summary>
/// This class collects and send all the telemetry information.
Expand Down Expand Up @@ -53,6 +56,9 @@ internal class ClientTelemetry : IDisposable
private ConcurrentDictionary<CacheRefreshInfo, LongConcurrentHistogram> cacheRefreshInfoMap
= new ConcurrentDictionary<CacheRefreshInfo, LongConcurrentHistogram>();

private ConcurrentDictionary<RequestInfo, LongConcurrentHistogram> requestInfoMap
= new ConcurrentDictionary<RequestInfo, LongConcurrentHistogram>();

private int numberOfFailures = 0;

/// <summary>
Expand Down Expand Up @@ -182,22 +188,26 @@ private async Task EnrichAndSendAsync()
this.RecordSystemUtilization();

this.clientTelemetryInfo.DateTimeUtc = DateTime.UtcNow.ToString(ClientTelemetryOptions.DateFormat);

ConcurrentDictionary<OperationInfo, (LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge)> operationInfoSnapshot
= Interlocked.Exchange(ref this.operationInfoMap, new ConcurrentDictionary<OperationInfo, (LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge)>());

ConcurrentDictionary<CacheRefreshInfo, LongConcurrentHistogram> cacheRefreshInfoSnapshot
= Interlocked.Exchange(ref this.cacheRefreshInfoMap, new ConcurrentDictionary<CacheRefreshInfo, LongConcurrentHistogram>());

ConcurrentDictionary<RequestInfo, LongConcurrentHistogram> requestInfoSnapshot
= Interlocked.Exchange(ref this.requestInfoMap, new ConcurrentDictionary<RequestInfo, LongConcurrentHistogram>());

this.clientTelemetryInfo.OperationInfo = ClientTelemetryHelper.ToListWithMetricsInfo(operationInfoSnapshot);
this.clientTelemetryInfo.CacheRefreshInfo = ClientTelemetryHelper.ToListWithMetricsInfo(cacheRefreshInfoSnapshot);

this.clientTelemetryInfo.RequestInfo = ClientTelemetryHelper.ToListWithMetricsInfo(requestInfoSnapshot);

await this.SendAsync();
}
}
catch (Exception ex)
{
DefaultTrace.TraceError("Exception in EnrichAndSendAsync() : {0}", ex.Message);
DefaultTrace.TraceError("Exception in EnrichAndSendAsync() : {0}", ex);
}

DefaultTrace.TraceInformation("Telemetry Job Stopped.");
Expand All @@ -224,7 +234,7 @@ internal void CollectCacheInfo(string cacheRefreshSource,
}

DefaultTrace.TraceVerbose($"Collecting cacheRefreshSource {cacheRefreshSource} data for Telemetry.");

string regionsContacted = ClientTelemetryHelper.GetContactedRegions(regionsContactedList);

// Recording Request Latency
Expand All @@ -249,7 +259,7 @@ internal void CollectCacheInfo(string cacheRefreshSource,
}
catch (Exception ex)
{
DefaultTrace.TraceError("Latency Recording Failed by Telemetry. Exception : {0}", ex.Message);
DefaultTrace.TraceError("Latency Recording Failed by Telemetry. Exception : {0}", ex);
}
}

Expand All @@ -266,6 +276,7 @@ internal void CollectCacheInfo(string cacheRefreshSource,
/// <param name="consistencyLevel"></param>
/// <param name="requestCharge"></param>
/// <param name="subStatusCode"></param>
/// <param name="trace"></param>
internal void CollectOperationInfo(CosmosDiagnostics cosmosDiagnostics,
HttpStatusCode statusCode,
long responseSizeInBytes,
Expand All @@ -275,17 +286,22 @@ internal void CollectOperationInfo(CosmosDiagnostics cosmosDiagnostics,
ResourceType resourceType,
string consistencyLevel,
double requestCharge,
SubStatusCodes subStatusCode)
SubStatusCodes subStatusCode,
ITrace trace)
{
DefaultTrace.TraceVerbose("Collecting Operation data for Telemetry.");

if (cosmosDiagnostics == null)
{
throw new ArgumentNullException(nameof(cosmosDiagnostics));
}

string regionsContacted = ClientTelemetryHelper.GetContactedRegions(cosmosDiagnostics.GetContactedRegions());
// Record Network/Replica Information
SummaryDiagnostics summaryDiagnostics = new SummaryDiagnostics(trace);
this.RecordRntbdResponses(containerId, databaseId, summaryDiagnostics.StoreResponseStatistics.Value);

string regionsContacted = ClientTelemetryHelper.GetContactedRegions(cosmosDiagnostics.GetContactedRegions());

// Recording Request Latency and Request Charge
OperationInfo payloadKey = new OperationInfo(regionsContacted: regionsContacted?.ToString(),
responseSizeInBytes: responseSizeInBytes,
Expand All @@ -307,10 +323,10 @@ internal void CollectOperationInfo(CosmosDiagnostics cosmosDiagnostics,
try
{
latency.RecordValue(cosmosDiagnostics.GetClientElapsedTime().Ticks);
}
}
catch (Exception ex)
{
DefaultTrace.TraceError("Latency Recording Failed by Telemetry. Exception : {0}", ex.Message);
DefaultTrace.TraceError("Latency Recording Failed by Telemetry. Exception : {0}", ex);
}

long requestChargeToRecord = (long)(requestCharge * ClientTelemetryOptions.HistogramPrecisionFactor);
Expand All @@ -320,10 +336,41 @@ internal void CollectOperationInfo(CosmosDiagnostics cosmosDiagnostics,
}
catch (Exception ex)
{
DefaultTrace.TraceError("Request Charge Recording Failed by Telemetry. Request Charge Value : {0} Exception : {1} ", requestChargeToRecord, ex.Message);
DefaultTrace.TraceError("Request Charge Recording Failed by Telemetry. Request Charge Value : {0} Exception : {1} ", requestChargeToRecord, ex);
}
}

/// <summary>
/// Records RNTBD calls statistics
/// </summary>
/// <param name="containerId"></param>
/// <param name="databaseId"></param>
/// <param name="storeResponseStatistics"></param>
private void RecordRntbdResponses(string containerId, string databaseId, List<StoreResponseStatistics> storeResponseStatistics)
{
foreach (StoreResponseStatistics storetatistics in storeResponseStatistics)
{
if (ClientTelemetryOptions.IsEligible((int)storetatistics.StoreResult.StatusCode, (int)storetatistics.StoreResult.SubStatusCode, storetatistics.RequestLatency))
{
RequestInfo requestInfo = new RequestInfo()
{
DatabaseName = databaseId,
ContainerName = containerId,
Uri = storetatistics.StoreResult.StorePhysicalAddress.ToString(),
StatusCode = (int)storetatistics.StoreResult.StatusCode,
SubStatusCode = (int)storetatistics.StoreResult.SubStatusCode,
Resource = storetatistics.RequestResourceType.ToString(),
Operation = storetatistics.RequestOperationType.ToString(),
};

LongConcurrentHistogram latencyHist = this.requestInfoMap.GetOrAdd(requestInfo, x => new LongConcurrentHistogram(ClientTelemetryOptions.RequestLatencyMin,
ClientTelemetryOptions.RequestLatencyMax,
ClientTelemetryOptions.RequestLatencyPrecision));
latencyHist.RecordValue(storetatistics.RequestLatency.Ticks);
}
}
}

/// <summary>
/// Record CPU and memory usage which will be sent as part of telemetry information
/// </summary>
Expand All @@ -349,7 +396,7 @@ private void RecordSystemUtilization()
}
catch (Exception ex)
{
DefaultTrace.TraceError("System Usage Recording Error : {0}", ex.Message);
DefaultTrace.TraceError("System Usage Recording Error : {0} ", ex);
}
}

Expand Down Expand Up @@ -427,7 +474,7 @@ await this.tokenProvider.AddAuthorizationHeaderAsync(
{
this.numberOfFailures++;

DefaultTrace.TraceError("Exception while sending telemetry data : {0}", ex.Message);
DefaultTrace.TraceError("Exception while sending telemetry data : {0}", ex);
}
finally
{
Expand Down
35 changes: 30 additions & 5 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,12 @@ namespace Microsoft.Azure.Cosmos.Telemetry
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using HdrHistogram;
using Microsoft.Azure.Cosmos.Core.Trace;
using Microsoft.Azure.Cosmos.Routing;
using Microsoft.Azure.Cosmos.Telemetry.Models;
using Microsoft.Azure.Documents;
using Microsoft.Azure.Documents.Rntbd;

internal static class ClientTelemetryHelper
Expand Down Expand Up @@ -93,7 +90,7 @@ internal static List<OperationInfo> ToListWithMetricsInfo(
OperationInfo payloadForLatency = entry.Key;
payloadForLatency.MetricInfo = new MetricInfo(ClientTelemetryOptions.RequestLatencyName, ClientTelemetryOptions.RequestLatencyUnit);
payloadForLatency.SetAggregators(entry.Value.latency, ClientTelemetryOptions.TicksToMsFactor);

payloadWithMetricInformation.Add(payloadForLatency);

OperationInfo payloadForRequestCharge = payloadForLatency.Copy();
Expand All @@ -103,7 +100,35 @@ internal static List<OperationInfo> ToListWithMetricsInfo(
payloadWithMetricInformation.Add(payloadForRequestCharge);
}

DefaultTrace.TraceInformation("Aggregating operation information to list done");
DefaultTrace.TraceVerbose("Aggregating operation information to list done");

return payloadWithMetricInformation;
}

/// <summary>
/// Convert map with request information to list of operations along with request latency and request charge metrics
/// </summary>
/// <param name="metrics"></param>
/// <returns>Collection of ReportPayload</returns>
internal static List<RequestInfo> ToListWithMetricsInfo(
IDictionary<RequestInfo,
LongConcurrentHistogram> metrics)
{
DefaultTrace.TraceVerbose("Aggregating RequestInfo information to list started");

List<RequestInfo> payloadWithMetricInformation = new List<RequestInfo>();
foreach (KeyValuePair<RequestInfo, LongConcurrentHistogram> entry in metrics)
{
MetricInfo metricInfo = new MetricInfo(ClientTelemetryOptions.RequestLatencyName, ClientTelemetryOptions.RequestLatencyUnit);
metricInfo.SetAggregators(entry.Value, ClientTelemetryOptions.TicksToMsFactor);

RequestInfo payloadForLatency = entry.Key;
payloadForLatency.Metrics.Add(metricInfo);

payloadWithMetricInformation.Add(payloadForLatency);
}

DefaultTrace.TraceVerbose("Aggregating RequestInfo information to list done");

return payloadWithMetricInformation;
}
Expand Down
38 changes: 34 additions & 4 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
namespace Microsoft.Azure.Cosmos.Telemetry
{
using System;
using System.Collections.Generic;
using Microsoft.Azure.Cosmos.Core.Trace;
using Microsoft.Azure.Cosmos.Telemetry.Models;
using Microsoft.Azure.Documents;
using Newtonsoft.Json;
using Util;

internal static class ClientTelemetryOptions
{
Expand Down Expand Up @@ -79,25 +79,28 @@ internal static class ClientTelemetryOptions
internal const double Percentile99 = 99.0;
internal const double Percentile999 = 99.9;
internal const string DateFormat = "yyyy-MM-ddTHH:mm:ssZ";

internal const string EnvPropsClientTelemetrySchedulingInSeconds = "COSMOS.CLIENT_TELEMETRY_SCHEDULING_IN_SECONDS";
internal const string EnvPropsClientTelemetryEnabled = "COSMOS.CLIENT_TELEMETRY_ENABLED";
internal const string EnvPropsClientTelemetryVmMetadataUrl = "COSMOS.VM_METADATA_URL";
internal const string EnvPropsClientTelemetryEndpoint = "COSMOS.CLIENT_TELEMETRY_ENDPOINT";
internal const string EnvPropsClientTelemetryEnvironmentName = "COSMOS.ENVIRONMENT_NAME";

internal static readonly ResourceType AllowedResourceTypes = ResourceType.Document;

// Why 5 sec? As of now, if any network request is taking more than 5 millisecond sec, we will consider it slow request this value can be revisited in future
private static readonly TimeSpan NetworkLatencyThreshold = TimeSpan.FromMilliseconds(5);
internal static readonly JsonSerializerSettings JsonSerializerSettings = new JsonSerializerSettings
{
NullValueHandling = NullValueHandling.Ignore,
MaxDepth = 64, // https://github.com/advisories/GHSA-5crp-9r3c-p9vr
};

private static readonly List<int> ExcludedStatusCodes = new List<int> { 404, 409 };

private static Uri clientTelemetryEndpoint;
private static string environmentName;
private static TimeSpan scheduledTimeSpan = TimeSpan.Zero;

internal static bool IsClientTelemetryEnabled()
{
bool isTelemetryEnabled = ConfigurationManager
Expand Down Expand Up @@ -175,5 +178,32 @@ internal static string GetEnvironmentName()
}
return environmentName;
}

/// <summary>
/// This method will return true if the request is failed with User or Server Exception and not excluded from telemetry.
/// This method will return true if the request latency is more than the threshold.
/// otherwise return false
/// </summary>
/// <param name="statusCode"></param>
/// <param name="subStatusCode"></param>
/// <param name="latencyInMs"></param>
/// <returns>true/false</returns>
internal static bool IsEligible(int statusCode, int subStatusCode, TimeSpan latencyInMs)
{
return
ClientTelemetryOptions.IsStatusCodeNotExcluded(statusCode, subStatusCode) &&
(ClientTelemetryOptions.IsUserOrServerError(statusCode) || latencyInMs >= ClientTelemetryOptions.NetworkLatencyThreshold);
}

private static bool IsUserOrServerError(int statusCode)
{
return statusCode >= 400 && statusCode <= 599;
}

private static bool IsStatusCodeNotExcluded(int statusCode, int subStatusCode)
{
return !(ClientTelemetryOptions.ExcludedStatusCodes.Contains(statusCode) && subStatusCode == 0);
ealsur marked this conversation as resolved.
Show resolved Hide resolved
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ internal sealed class ClientTelemetryProperties
[JsonProperty(PropertyName = "operationInfo")]
internal List<OperationInfo> OperationInfo { get; set; }

[JsonProperty(PropertyName = "requestInfo")]
internal List<RequestInfo> RequestInfo { get; set; }

[JsonIgnore]
internal bool IsDirectConnectionMode { get; }

Expand Down Expand Up @@ -97,6 +100,7 @@ public ClientTelemetryProperties(string dateTimeUtc,
List<SystemInfo> systemInfo,
List<CacheRefreshInfo> cacheRefreshInfo,
List<OperationInfo> operationInfo,
List<RequestInfo> requestInfo,
string machineId)
{
this.DateTimeUtc = dateTimeUtc;
Expand All @@ -111,6 +115,7 @@ public ClientTelemetryProperties(string dateTimeUtc,
this.SystemInfo = systemInfo;
this.CacheRefreshInfo = cacheRefreshInfo;
this.OperationInfo = operationInfo;
this.RequestInfo = requestInfo;
this.PreferredRegions = preferredRegions;
this.MachineId = machineId;
}
Expand Down
Loading