inputs.vsphere, reports duplicate points #11168

Closed
yogeshprasad opened this issue May 23, 2022 · 30 comments · Fixed by #12259
Labels
bug (unexpected problem or unintended behavior)

Comments

@yogeshprasad

Relevant telegraf.conf

[agent]
  ## Default data collection interval for all inputs
  interval = "60s"

# Read metrics from one or many vCenters
[[inputs.vsphere]]
  ## List of vCenter URLs to be monitored. These three lines must be uncommented
  ## and edited for the plugin to work.
  vcenters = [ "https://vcenter.local/sdk" ]
  username = "[email protected]"
  password = "secret"

  ## VMs
  ## Typical VM metrics (if omitted or empty, all metrics are collected)
  # vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected)
  # vm_exclude = [] # Inventory paths to exclude
  vm_metric_include = [
    "cpu.demand.average",
    "cpu.idle.summation",
    "cpu.latency.average",
    "cpu.readiness.average",
    "cpu.ready.summation",
    "cpu.run.summation",
    "cpu.usagemhz.average",
    "cpu.used.summation",
    "cpu.wait.summation",
    "mem.active.average",
    "mem.granted.average",
    "mem.latency.average",
    "mem.swapin.average",
    "mem.swapinRate.average",
    "mem.swapout.average",
    "mem.swapoutRate.average",
    "mem.usage.average",
    "mem.vmmemctl.average",
    "net.bytesRx.average",
    "net.bytesTx.average",
    "net.droppedRx.summation",
    "net.droppedTx.summation",
    "net.usage.average",
    "power.power.average",
    "virtualDisk.numberReadAveraged.average",
    "virtualDisk.numberWriteAveraged.average",
    "virtualDisk.read.average",
    "virtualDisk.readOIO.latest",
    "virtualDisk.throughput.usage.average",
    "virtualDisk.totalReadLatency.average",
    "virtualDisk.totalWriteLatency.average",
    "virtualDisk.write.average",
    "virtualDisk.writeOIO.latest",
    "sys.uptime.latest",
  ]
  # vm_metric_exclude = [] ## Nothing is excluded by default
  # vm_instances = true ## true by default

  ## Hosts
  ## Typical host metrics (if omitted or empty, all metrics are collected)
  # host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected)
  # host_exclude = [] # Inventory paths to exclude
  host_metric_include = [
    "cpu.coreUtilization.average",
    "cpu.costop.summation",
    "cpu.demand.average",
    "cpu.idle.summation",
    "cpu.latency.average",
    "cpu.readiness.average",
    "cpu.ready.summation",
    "cpu.swapwait.summation",
    "cpu.usage.average",
    "cpu.usagemhz.average",
    "cpu.used.summation",
    "cpu.utilization.average",
    "cpu.wait.summation",
    "disk.deviceReadLatency.average",
    "disk.deviceWriteLatency.average",
    "disk.kernelReadLatency.average",
    "disk.kernelWriteLatency.average",
    "disk.numberReadAveraged.average",
    "disk.numberWriteAveraged.average",
    "disk.read.average",
    "disk.totalReadLatency.average",
    "disk.totalWriteLatency.average",
    "disk.write.average",
    "mem.active.average",
    "mem.latency.average",
    "mem.state.latest",
    "mem.swapin.average",
    "mem.swapinRate.average",
    "mem.swapout.average",
    "mem.swapoutRate.average",
    "mem.totalCapacity.average",
    "mem.usage.average",
    "mem.vmmemctl.average",
    "net.bytesRx.average",
    "net.bytesTx.average",
    "net.droppedRx.summation",
    "net.droppedTx.summation",
    "net.errorsRx.summation",
    "net.errorsTx.summation",
    "net.usage.average",
    "power.power.average",
    "storageAdapter.numberReadAveraged.average",
    "storageAdapter.numberWriteAveraged.average",
    "storageAdapter.read.average",
    "storageAdapter.write.average",
    "sys.uptime.latest",
  ]
  ## Collect IP addresses? Valid values are "ipv4" and "ipv6"
  # ip_addresses = ["ipv6", "ipv4" ]

  # host_metric_exclude = [] ## Nothing excluded by default
  # host_instances = true ## true by default


  ## Clusters
  # cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
  # cluster_exclude = [] # Inventory paths to exclude
  # cluster_metric_include = [] ## if omitted or empty, all metrics are collected
  # cluster_metric_exclude = [] ## Nothing excluded by default
  # cluster_instances = false ## false by default

  ## Resource Pools
  # datastore_include = [ "/*/host/**"] # Inventory path to datastores to collect (by default all are collected)
  # datastore_exclude = [] # Inventory paths to exclude
  # datastore_metric_include = [] ## if omitted or empty, all metrics are collected
  # datastore_metric_exclude = [] ## Nothing excluded by default
  # datastore_instances = false ## false by default

  ## Datastores
  # datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected)
  # datastore_exclude = [] # Inventory paths to exclude
  # datastore_metric_include = [] ## if omitted or empty, all metrics are collected
  # datastore_metric_exclude = [] ## Nothing excluded by default
  # datastore_instances = false ## false by default

  ## Datacenters
  # datacenter_include = [ "/*/host/**"] # Inventory path to datacenters to collect (by default all are collected)
  # datacenter_exclude = [] # Inventory paths to exclude
  datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
  datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
  # datacenter_instances = false ## false by default

  ## Plugin Settings
  ## separator character to use for measurement and field names (default: "_")
  # separator = "_"

  ## number of objects to retrieve per query for realtime resources (vms and hosts)
  ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
  # max_query_objects = 256

  ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
  ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
  # max_query_metrics = 256

  ## number of go routines to use for collection and discovery of objects and metrics
  # collect_concurrency = 1
  # discover_concurrency = 1

  ## the interval before (re)discovering objects subject to metrics collection (default: 300s)
  # object_discovery_interval = "300s"

  ## timeout applies to any of the API requests made to vCenter
  # timeout = "60s"

  ## When set to true, all samples are sent as integers. This makes the output
  ## data types backwards compatible with Telegraf 1.9 or lower. Normally all
  ## samples from vCenter, with the exception of percentages, are integer
  ## values, but under some conditions, some averaging takes place internally in
  ## the plugin. Setting this flag to "false" will send values as floats to
  ## preserve the full precision when averaging takes place.
  # use_int_samples = true

  ## The number of vSphere 5 minute metric collection cycles to look back for non-realtime metrics. In 
  ## some versions (6.7, 7.0 and possibly more), certain metrics, such as cluster metrics, may be reported
  ## with a significant delay (>30min). If this happens, try increasing this number. Please note that increasing
  ## it too much may cause performance issues.
  # metric_lookback = 3

  ## Custom attributes from vCenter can be very useful for queries in order to slice the
  ## metrics along different dimensions and for forming ad-hoc relationships. They are disabled
  ## by default, since they can add a considerable amount of tags to the resulting metrics. To
  ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
  ## to select the attributes you want to include.
  # custom_attribute_include = []
  # custom_attribute_exclude = ["*"]

  ## Optional SSL Config
  # ssl_ca = "/path/to/cafile"
  # ssl_cert = "/path/to/certfile"
  # ssl_key = "/path/to/keyfile"
  ## Use SSL but skip chain & host verification
  # insecure_skip_verify = false

Logs from Telegraf

No relevant logs

System info

Telegraf v1.22.4

Docker

No response

Steps to reproduce

  1. Set interval = "60s" in the agent section
  2. Enable the vSphere input plugin with default settings
  3. Run Telegraf for 5 minutes

Expected behavior

There should not be any duplicate points

Actual behavior

Nearly 10% of the points are duplicates.

Additional info

This is the function that creates duplicate points, since the timestamps are adjusted here:
https://github.com/influxdata/telegraf/blob/master/plugins/inputs/vsphere/endpoint.go#L1104

@yogeshprasad added the bug label May 23, 2022
@powersj
Contributor

powersj commented May 24, 2022

Hi,

Would you be willing to share some of the example duplicate points from a telegraf log? It would be helpful to use the [[outputs.file]] output to see what metrics you are getting and if they do show up multiple times.
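
For reference, a minimal file output along these lines can be used to capture the raw metrics to a local file for comparison (the file path here is just an example):

[[outputs.file]]
  ## Write gathered metrics to a local file in line protocol for inspection
  files = ["/tmp/metrics.out"]
  data_format = "influx"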

Thanks!

@powersj added the waiting for response label May 24, 2022
@yogeshprasad
Author

Hi @powersj
Here is the file metrics.out.zip; it has data for more than 10 minutes, and nearly 25% of the data is duplicated.
Some of the metrics:

"vsphere.vm.mem.usage.average" source="skajagar-win2008" 1653477360000 6.99 "esxhostname"="gty-a003.test.com" "dcname"="WF_BGLR_INT_DC" "clustername"="WF_BGLR_INT_Cluster01" "_wavefront_source"="proxy::abcd-efgh-241-239.test.com" "vcenter"="int-vc.test.com" "guest"="winLonghorn64" "uuid"="420b5c3c-6f1b-d231-314b-23b88901cd98" "vmname"="skajagar-win2008" "moid"="vm-129" "guesthostname"="WIN-PR69FI29KLE"
--->"vsphere.vm.mem.usage.average" source="skajagar-win2008" 1653477360000 6.99 "esxhostname"="gty-a003.test.com" "dcname"="WF_BGLR_INT_DC" "clustername"="WF_BGLR_INT_Cluster01" "_wavefront_source"="proxy::abcd-efgh-241-239.test.com" "vcenter"="int-vc.test.com" "guest"="winLonghorn64" "vmname"="skajagar-win2008" "uuid"="420b5c3c-6f1b-d231-314b-23b88901cd98" "moid"="vm-129" "guesthostname"="WIN-PR69FI29KLE"

"vsphere.host.cpu.used.summation" source="gty-a003.test.com" 1653477360000 43.0 "esxhostname"="gty-a003.test.com" "dcname"="WF_BGLR_INT_DC" "clustername"="WF_BGLR_INT_Cluster01" "_wavefront_source"="proxy::abcd-efgh-241-239.test.com" "vcenter"="int-vc.test.com" "cpu"="81" "moid"="host-17"
--->"vsphere.host.cpu.used.summation" source="gty-a003.test.com" 1653477360000 48.0 "esxhostname"="gty-a003.test.com" "dcname"="WF_BGLR_INT_DC" "clustername"="WF_BGLR_INT_Cluster01" "_wavefront_source"="proxy::abcd-efgh-241-239.test.com" "vcenter"="int-vc.test.com" "cpu"="81" "moid"="host-17"

"vsphere.host.cpu.coreUtilization.average" source="gty-a003.test.com" 1653477360000 1.04 "esxhostname"="gty-a003.test.com" "dcname"="WF_BGLR_INT_DC" "clustername"="WF_BGLR_INT_Cluster01" "_wavefront_source"="proxy::abcd-efgh-241-239.test.com" "vcenter"="int-vc.test.com" "cpu"="31" "moid"="host-17"
--->"vsphere.host.cpu.coreUtilization.average" source="gty-a003.test.com" 1653477360000 0.98 "esxhostname"="gty-a003.test.com" "dcname"="WF_BGLR_INT_DC" "clustername"="WF_BGLR_INT_Cluster01" "_wavefront_source"="proxy::abcd-efgh-241-239.test.com" "vcenter"="int-vc.test.com" "cpu"="31" "moid"="host-17"

@telegraf-tiger bot removed the waiting for response label May 26, 2022
@powersj
Contributor

powersj commented May 27, 2022

Which output are you pushing this to? And which output produced the samples in your comment above?

When looking at the metrics.out file, I started at the timestamp you first referenced, 1653477360000, and searched for "winLonghorn64". There are some very similar lines, but they are different.

For example:

- vsphere_vm_power,clustername=WF_BGLR_INT_Cluster01,dcname=WF_BGLR_INT_DC,esxhostname=gty-a003.test.com,guest=winLonghorn64,guesthostname=WIN-PR69FI29KLE,host=telegraf_rhel_nossl,moid=vm-129,source=skajagar-win2008,uuid=420b5c3c-6f1b-d231-314b-23b88901cd98,vcenter=int-vc.test.com,vmname=skajagar-win2008 power_average=244i 1653477360000000000
+ vsphere_vm_power,clustername=WF_BGLR_INT_Cluster01,dcname=WF_BGLR_INT_DC,esxhostname=gty-a003.test.com,guest=winLonghorn64,guesthostname=WIN-PR69FI29KLE,host=telegraf_rhel_nossl,moid=vm-129,source=skajagar-win2008,uuid=420b5c3c-6f1b-d231-314b-23b88901cd98,vcenter=int-vc.test.com,vmname=skajagar-win2008 power_average=247i 1653477360000000000

In the above, the tags are all the same, but the field value reported for power_average is different between the two.

A similar example looking at the memory output:

- vsphere_vm_mem,clustername=WF_BGLR_INT_Cluster01,dcname=WF_BGLR_INT_DC,esxhostname=gty-a003.test.com,guest=winLonghorn64,guesthostname=WIN-FCCHP2L47JJ,host=telegraf_rhel_nossl,moid=vm-131,source=skajagar-win2016-Activated,uuid=420bc4e3-d966-403e-f6f7-7bb6223b6fae,vcenter=int-vc.test.com,vmname=skajagar-win2016-Activated swapinRate_average=0i,active_average=419429i,swapout_average=0i,swapin_average=0i,swapoutRate_average=0i,latency_average=0,vmmemctl_average=0i,usage_average=9.99,granted_average=4194304i 1653477360000000000
+ vsphere_vm_mem,clustername=WF_BGLR_INT_Cluster01,dcname=WF_BGLR_INT_DC,esxhostname=gty-a003.test.com,guest=winLonghorn64,guesthostname=WIN-FCCHP2L47JJ,host=telegraf_rhel_nossl,moid=vm-131,source=skajagar-win2016-Activated,uuid=420bc4e3-d966-403e-f6f7-7bb6223b6fae,vcenter=int-vc.test.com,vmname=skajagar-win2016-Activated active_average=398458i,granted_average=4194304i,swapout_average=0i,swapin_average=0i,swapoutRate_average=0i,swapinRate_average=0i,latency_average=0,vmmemctl_average=0i,usage_average=9.49 1653477360000000000

Here the active_average is different as well.

So I would not say that these are duplicates. What I do wonder about is why the metric is showing up twice with the same timestamp.

Can you please provide the rest of your configuration?

@powersj added the waiting for response label May 27, 2022
@yogeshprasad
Author

@powersj Thanks for the analysis.
As you mentioned, in the example below everything is the same except the value, and that is exactly the problem. How can a resource have two different values at the same time? Because of this we are not able to conclude which value is the correct one.

- vsphere_vm_power,clustername=WF_BGLR_INT_Cluster01,dcname=WF_BGLR_INT_DC,esxhostname=gty-a003.test.com,guest=winLonghorn64,guesthostname=WIN-PR69FI29KLE,host=telegraf_rhel_nossl,moid=vm-129,source=skajagar-win2008,uuid=420b5c3c-6f1b-d231-314b-23b88901cd98,vcenter=int-vc.test.com,vmname=skajagar-win2008 power_average=244i 1653477360000000000
+ vsphere_vm_power,clustername=WF_BGLR_INT_Cluster01,dcname=WF_BGLR_INT_DC,esxhostname=gty-a003.test.com,guest=winLonghorn64,guesthostname=WIN-PR69FI29KLE,host=telegraf_rhel_nossl,moid=vm-129,source=skajagar-win2008,uuid=420b5c3c-6f1b-d231-314b-23b88901cd98,vcenter=int-vc.test.com,vmname=skajagar-win2008 power_average=247i 1653477360000000000

@telegraf-tiger bot removed the waiting for response label May 28, 2022
@powersj
Contributor

powersj commented Jun 1, 2022

Since I don't think the plugin would suddenly start generating duplicate metrics, it looks like you may have two plugins running at the same time.

Some follow up questions:

  • How are you running Telegraf?
  • How many configuration files do you have?
  • Can you provide your entire configuration file(s) with secrets removed?
  • Do you have multiple vsphere input plugins configured? Can you provide the start of the logs which shows what plugins are loaded?

If you add the following to your config, does the output change?

[[inputs.vsphere]]
  name_override = "vsphere_local"
  vcenters = [ "https://vcenter.local/sdk" ]

I would expect to see a single metric called "vsphere_local".

@powersj added the waiting for response label Jun 1, 2022
@yogeshprasad
Author

  • Running Telegraf as a Service
2022-06-02T13:17:30Z I! Starting Telegraf 1.22.3
2022-06-02T13:17:30Z I! Loaded inputs: vsphere
2022-06-02T13:17:30Z I! Loaded aggregators:
2022-06-02T13:17:30Z I! Loaded processors:
2022-06-02T13:17:30Z I! Loaded outputs: file wavefront
2022-06-02T13:17:30Z I! Tags enabled: host=sk-tel-centos-8
2022-06-02T13:17:30Z W! Deprecated inputs: 0 and 1 options
2022-06-02T13:17:30Z I! [agent] Config: Interval:1m0s, Quiet:false, Hostname:"sk-tel-centos-8", Flush Interval:10s
2022-06-02T13:17:30Z I! [inputs.vsphere] Starting plugin
2022-06-02T13:17:31Z I! connected to Wavefront proxy at address: 19.201.21.22:2878
  • How many configuration files do you have?

    1. telegraf.conf
    2. vsphere.conf
    3. files.conf
    4. wavefront.conf
  • Do you have multiple vsphere input plugins configured? Can you provide the start of the logs which shows what plugins are loaded?

No, we have only one vsphere plugin enabled.

  • If you add the following to your config, does the output change?

Yes, all metric names got replaced with vsphere_local.

@telegraf-tiger bot removed the waiting for response label Jun 2, 2022
@powersj
Contributor

powersj commented Jun 2, 2022

  • What version of vsphere are you running?
  • If you exclude all the vm_metrics with vm_metric_exclude = [ "*" ] and delete the vm_metric_include array, do you still get duplicates?

@powersj added the waiting for response label Jun 2, 2022
@yogeshprasad
Author

What version of vsphere are you running?

vSphere Client version 7.0.1.00200

If you exclude all the vm_metrics with vm_metric_exclude = [ "*" ] and delete the vm_metric_include array, do you still get duplicates?

We are getting duplicates for all kinds of metrics.

I suspect these two places, since the timestamps are adjusted here:
https://github.com/influxdata/telegraf/blob/master/plugins/inputs/vsphere/endpoint.go#L1104
https://github.com/influxdata/telegraf/blob/master/plugins/inputs/vsphere/endpoint.go#L936

@telegraf-tiger bot removed the waiting for response label Jun 3, 2022
@powersj
Contributor

powersj commented Jun 3, 2022

We are getting duplicates for all kinds of metrics.

Does that mean you still are getting duplicates with excluding VM metrics?

@yogeshprasad
Author

I disabled the VM metrics as you suggested, captured the results for 7 minutes, and found nearly 33% duplicates:

Total Points: 30113
Duplicate points: 10008
Duplicate %: 33.2348155281772

@powersj
Contributor

powersj commented Jun 10, 2022

Can you do one more thing: run with --debug and get me the full logs? The plugin appears to have quite a few debug statements, and I'd like to follow along with what is happening.

@powersj added the waiting for response label Jun 10, 2022
@yogeshprasad
Author

Please find the log file with debug enabled
out.log

@telegraf-tiger bot removed the waiting for response label Jun 13, 2022
@yogeshprasad
Author

@powersj are we still looking into it?

@powersj
Contributor

powersj commented Jun 22, 2022

It's on my list, but not something I've gotten back around to. I have only briefly looked at the log you provided, and I do believe the next step is to add a bit more logging to see where duplicates are getting created.

@yogeshprasad
Author

If you need any help, please let me know; I can add extra logs wherever you suggest and provide you with the output.

@yogeshprasad
Author

@powersj any idea when this will be prioritized?
I have to communicate the same to the customers.

@powersj
Contributor

powersj commented Jun 30, 2022

Thanks again for the logs.

It would be really nice to isolate this down to a metric that we know is duplicated. For example, we identified the power.power.average metric as a duplicate. Can you run and collect that metric for only a VM, excluding all other metrics, with debugging still enabled? I believe this config, with the correct address and credentials, would do this:

[agent]
  interval = "60s"
  debug = true
  
[[inputs.vsphere]]
  vcenters = [ "https://vcenter.local/sdk" ]
  username = "[email protected]"
  password = "secret"

  vm_metric_include = ["power.power.average"]
  
  host_metric_exclude = ["*"]
  cluster_metric_exclude = ["*"]
  datastore_metric_exclude = ["*"]
  datacenter_metric_exclude = ["*"]
  resourcepool_metric_exclude = ["*"]

Then look at the data and let me know if you still see the duplicates. If so, please include the debug log.

If that does not produce duplicates, then I would make the following change to the config:

--- host_metric_exclude = ["*"]
+++ host_metric_exclude = ["power.power.average"]

Based on the inventory path example, it does look like there are multiple ways to reference a VM: either via the host folder path or the VM path. I have not looked deeper into this, but that seems like an obvious place where duplicate metrics could be showing up.

@yogeshprasad
Author

@powersj
Thanks for taking a look into it. I see other metrics as duplicates as well. To me, the line below looks suspicious:
https://github.com/influxdata/telegraf/blob/master/plugins/inputs/vsphere/endpoint.go#L1121

As we know, real-time metrics are available at 20-second granularity. Let's say we have a one-minute refresh rate, and the last time we reported points was at 10:30, with timestamps {10:30:00, 10:31:00}. The next refresh will happen at 10:31, and for real-time metrics we will get 3 points since they have 20-second granularity; let's say those 3 points have timestamps {10:30:20, 10:30:40, 10:31:00}. After Truncate this gets converted to two points with timestamps {10:30:00, 10:31:00}, and in this case the first point becomes a duplicate because we already reported a point with that timestamp in the last refresh.
Below is the output after adding extra logging:

2022-07-01T14:41:06Z D! [inputs.vsphere] Before Align timestamp: [{{} 2022-07-01 14:40:20 +0000 UTC 20} {{} 2022-07-01 14:40:40 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [2 17 1]
2022-07-01T14:41:06Z D! [inputs.vsphere] After Align timestamp: [{{} 2022-07-01 14:40:00 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [9.5 1]
2022-07-01T14:41:06Z D! [inputs.vsphere] Before Align timestamp: [{{} 2022-07-01 14:40:20 +0000 UTC 20} {{} 2022-07-01 14:40:40 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [0 0 0]
2022-07-01T14:41:06Z D! [inputs.vsphere] After Align timestamp: [{{} 2022-07-01 14:40:00 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [0 0]
2022-07-01T14:41:06Z D! [inputs.vsphere] Before Align timestamp: [{{} 2022-07-01 14:40:20 +0000 UTC 20} {{} 2022-07-01 14:40:40 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [0 0 0]
2022-07-01T14:41:06Z D! [inputs.vsphere] After Align timestamp: [{{} 2022-07-01 14:40:00 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [0 0]
2022-07-01T14:41:06Z D! [inputs.vsphere] Before Align timestamp: [{{} 2022-07-01 14:40:20 +0000 UTC 20} {{} 2022-07-01 14:40:40 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [68 201 98]
2022-07-01T14:41:06Z D! [inputs.vsphere] After Align timestamp: [{{} 2022-07-01 14:40:00 +0000 UTC 20} {{} 2022-07-01 14:41:00 +0000 UTC 20}] values: [134.5 98]

I did some experiments: with the alignSamples function removed I found no duplicates, and with it in place I get roughly 25% duplicates.
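
A minimal sketch (not the plugin's actual implementation) of the truncation effect described above, assuming a 60-second interval and the 20-second real-time samples from the debug log:

package main

import (
	"fmt"
	"time"
)

func main() {
	interval := time.Minute

	// Sample timestamps returned in one collection cycle at 20-second granularity.
	samples := []time.Time{
		time.Date(2022, 7, 1, 14, 40, 20, 0, time.UTC),
		time.Date(2022, 7, 1, 14, 40, 40, 0, time.UTC),
		time.Date(2022, 7, 1, 14, 41, 0, 0, time.UTC),
	}

	// Truncating each timestamp to the 60-second boundary collapses the first two
	// samples onto 14:40:00, a timestamp the previous cycle may already have emitted.
	for _, ts := range samples {
		fmt.Printf("%s -> %s\n", ts.Format(time.RFC3339), ts.Truncate(interval).Format(time.RFC3339))
	}
}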

@yogeshprasad
Author

@powersj does this analysis make any sense or am I missing something here?

@powersj
Contributor

powersj commented Jul 7, 2022

As we know, real-time metrics are available at 20-second granularity

We do not all know this, at least I did not :) Telegraf has hundreds of plugins connecting to various services and software, but none of the maintainers currently know all of the plugins on a deep level. I do see the discussion about real-time vs. historical metrics in the README, but it was not yet clear to me that it could be the culprit.

As such, I was still really hoping to at least see the logs with a single metric included, to learn more about how a single metric makes its way through the plugin, see how the vSphere interval was set, and check whether any padding was occurring.

After Truncate this gets converted to two points with timestamps {10:30:00, 10:31:00}, and in this case the first point becomes a duplicate because we already reported a point with that timestamp in the last refresh

Said another way, your hypothesis is that at Telegraf's flush interval N+1 we produce a metric that is identical to a metric from flush interval N, due to the alignSamples function? It is not clear to me why this function was even added in #5113 or what problem it solves.

@prydin do you have details or help you could provide on why a user might be seeing duplicate metrics come out of the alignSamples function?

@yogeshprasad
Author

@powersj since we are not getting a response from @prydin, can we make alignSamples configurable?

@prydin
Contributor

prydin commented Aug 3, 2022

Sorry for the delay. Let me have a look at alignSamples. The idea was to avoid duplicates, not create them, so something is clearly amiss there.

@yogeshprasad
Author

Hi @prydin did you get a chance to look into it?

@powersj
Contributor

powersj commented Oct 12, 2022

Hi,

Wanted to check in and see if you both were able to resolve the issue?

@powersj added the waiting for response label Oct 12, 2022
@prydin
Contributor

prydin commented Oct 13, 2022

@powersj I'm trying to reproduce this in my lab right now. I'll get back to you once I have an idea what's going on.

@telegraf-tiger bot removed the waiting for response label Oct 13, 2022
@powersj
Contributor

powersj commented Oct 13, 2022

Thanks!

@powersj added the waiting for response label Oct 13, 2022
@telegraf-tiger
Contributor

Hello! I am closing this issue due to inactivity. I hope you were able to resolve your problem, if not please try posting this question in our Community Slack or Community Page. Thank you!

@prydin
Contributor

prydin commented Oct 28, 2022

Not sure why this got auto-closed. @powersj I think I might have found a workaround: have you tried adding metric_lookback = 0 to your config file? This seems to solve the problem. Make sure you're not seeing any gaps in the data.
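
For reference, a sketch of where that setting would go, reusing the placeholder vCenter address from the config above:

[[inputs.vsphere]]
  vcenters = [ "https://vcenter.local/sdk" ]
  username = "[email protected]"
  password = "secret"

  ## Workaround suggested above: do not look back extra 5-minute collection cycles
  metric_lookback = 0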

@telegraf-tiger bot removed the waiting for response label Oct 28, 2022
@powersj
Contributor

powersj commented Oct 28, 2022

@yogeshprasad can you try the above suggestion?

@powersj added the waiting for response label Oct 28, 2022
@powersj reopened this Nov 17, 2022
@powersj
Contributor

powersj commented Nov 17, 2022

@yogeshprasad not sure if you have already, but can you try the PR in #12259?

@telegraf-tiger bot removed the waiting for response label Nov 17, 2022