
[inputs.vsphere]: Error in plugin: ServerFaultCode: This operation is restricted by the administrator #5037

Closed · sdesbois opened this issue Nov 26, 2018 · 11 comments

@sdesbois commented Nov 26, 2018

Relevant telegraf.conf:

> # # Read metrics from VMware vCenter
> [[inputs.vsphere]]
>   ## List of vCenter URLs to be monitored. These three lines must be uncommented
>   ## and edited for the plugin to work.
>   vcenters = [ "https://myvcenter1/sdk", "https://myvcenter2/sdk", "https://myvcenter3/sdk"]
>   username = "myusername"
>   password = "mypassword"
> 
>   interval = "20s"
> 
>   ## VMs
>   ## Typical VM metrics (if omitted or empty, all metrics are collected)
>   vm_metric_include = [
>     "cpu.ready.summation",
>     "cpu.costop.summation",
>     "cpu.usagemhz.average",
>     "cpu.usage.average",
>     "mem.active.average",
>     "mem.usage.average",
>     "net.received.average",
>     "net.transmitted.average",
>     "net.usage.average",
>     "net.packetsRx.summation",
>     "net.packetsTx.summation",
>     "virtualDisk.write.average",
>     "virtualDisk.read.average",
>     "virtualDisk.numberWriteAveraged.average",
>     "virtualDisk.numberReadAveraged.average",
>     "virtualDisk.totalWriteLatency.average",
>     "virtualDisk.totalReadLatency.average",
>     "disk.maxTotalLatency.latest",
>     "disk.usage.average",
>     "disk.read.average",
>     "disk.write.average",
>     "disk.commandsAveraged.average",
>   ]
>   # vm_metric_exclude = [] ## Nothing is excluded by default
>   # vm_instances = true ## true by default
> 
>   ## Hosts
>   ## Typical host metrics (if omitted or empty, all metrics are collected)
>   host_metric_include = [
>     "cpu.coreUtilization.average",
>     "cpu.costop.summation",
>     "cpu.demand.average",
>     "cpu.idle.summation",
>     "cpu.latency.average",
>     "cpu.readiness.average",
>     "cpu.ready.summation",
>     "cpu.swapwait.summation",
>     "cpu.usage.average",
>     "cpu.usagemhz.average",
>     "cpu.used.summation",
>     "cpu.utilization.average",
>     "cpu.wait.summation",
>     "disk.deviceReadLatency.average",
>     "disk.deviceWriteLatency.average",
>     "disk.kernelReadLatency.average",
>     "disk.kernelWriteLatency.average",
>     "disk.numberReadAveraged.average",
>     "disk.numberWriteAveraged.average",
>     "disk.read.average",
>     "disk.totalReadLatency.average",
>     "disk.totalWriteLatency.average",
>     "disk.write.average",
>     "mem.active.average",
>     "mem.latency.average",
>     "mem.state.latest",
>     "mem.swapin.average",
>     "mem.swapinRate.average",
>     "mem.swapout.average",
>     "mem.swapoutRate.average",
>     "mem.totalCapacity.average",
>     "mem.usage.average",
>     "mem.vmmemctl.average",
>     "net.bytesRx.average",
>     "net.bytesTx.average",
>     "net.droppedRx.summation",
>     "net.droppedTx.summation",
>     "net.errorsRx.summation",
>     "net.errorsTx.summation",
>     "net.usage.average",
>     "storageAdapter.numberReadAveraged.average",
>     "storageAdapter.numberWriteAveraged.average",
>     "storageAdapter.read.average",
>     "storageAdapter.write.average",
>     "sys.uptime.latest",
>   ]
>   # host_metric_exclude = [] ## Nothing excluded by default
>   # host_instances = true ## true by default
> 
>   ## Clusters
>   # cluster_metric_include = [] ## if omitted or empty, all metrics are collected
>   # cluster_metric_exclude = [] ## Nothing excluded by default
>   # cluster_instances = true ## true by default
> 
>   ## Datastores
> #  datastore_metric_include = [
> #  "datastore.numberReadAveraged.average",
> #  "datastore.numberWriteAveraged.average",
> #  "datastore.throughput.contention.average",
> #  "datastore.throughput.usage.average",
> #  "datastore.write.average",
> #  "datastore.read.average",
> #  "disk.used.latest",
> #  "disk.provisioned.latest",
> #  "disk.capacity.latest",
> #  "disk.capacity.contention.average",
> #  "disk.capacity.provisioned.average",
> #  "disk.capacity.usage.average",
> #  ] ## if omitted or empty, all metrics are collected
>   # datastore_metric_exclude = [] ## Nothing excluded by default
> #  datastore_instances = false ## false by default for Datastores only
> 
>   ## Datacenters
>   datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
>   datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
>   # datacenter_instances = false ## false by default for Datacenters only
> 
>   ## Plugin Settings
>   ## separator character to use for measurement and field names (default: "_")
>   # separator = "_"
> 
>   ## number of objects to retrieve per query for realtime resources (vms and hosts)
>   ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
>   max_query_objects = 32
> 
>   ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
>   ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
>   max_query_metrics = 32
> 
>   ## number of go routines to use for collection and discovery of objects and metrics
>   # collect_concurrency = 1
>   # discover_concurrency = 1
> 
>   ## whether or not to force discovery of new objects on initial gather call before collecting metrics
>   ## when true for large environments this may cause errors for time elapsed while collecting metrics
>   ## when false (default) the first collection cycle may result in no or limited metrics while objects are discovered
>   # force_discover_on_init = false
> 
>   ## the interval before (re)discovering objects subject to metrics collection (default: 300s)
>   # object_discovery_interval = "300s"
> 
>   ## timeout applies to any of the api requests made to vcenter
>   timeout = "10s"
> 
>   ## Optional SSL Config
>   # ssl_ca = "/path/to/cafile"
>   # ssl_cert = "/path/to/certfile"
>   # ssl_key = "/path/to/keyfile"
>   ## Use SSL but skip chain & host verification
>   insecure_skip_verify = true

System info:

Latest telegraf version (1.9.0)
Centos 7.5
vSphere 6.0 update 3

Steps to reproduce:

  1. Start telegraf
  2. ...

Expected behavior:

No error

Actual behavior:

2018-11-26T14:56:21Z E! [inputs.vsphere]: Error in plugin: ServerFaultCode: This operation is restricted by the administrator - 'vpxd.stats.maxQueryMetrics'. Contact your system administrator.

Additional info:

The account used to access vSphere is read-only.
Previous versions of Telegraf did not produce this error.

@prydin (Contributor) commented Nov 26, 2018

Are you using the cluster metrics? A quick fix is to turn off cluster metrics. There seems to be a bug in some versions of vCenter restricting cluster queries that shouldn't be restricted.

Try to add this:

cluster_metric_exclude = ["*"]
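
For context, a minimal sketch of where that setting sits in the plugin section (the option names come from the plugin's sample configuration; the vCenter URL and credentials below are just placeholders):

    [[inputs.vsphere]]
      vcenters = [ "https://myvcenter1/sdk" ]
      username = "myusername"
      password = "mypassword"

      ## Skip all cluster-level metrics until the vCenter-side limit is resolved
      cluster_metric_exclude = [ "*" ]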

@sdesbois (Author)

Indeed, this fixes the issue.
But I plan to use the cluster stats at some point.

@prydin (Contributor) commented Nov 26, 2018

You can synthesize most of them using aggregating queries across hosts based on the "clustername" tag.

That being said, we are actively looking at options for fixing this.

@prydin (Contributor) commented Nov 26, 2018

To help us debug this, could you tell me which version and build of vCenter you have?

@sdesbois (Author)

vCenter Server 6.0 U3h - build 9313458

@AndySilvia

Same here. Running with telegraf 1.10.4-1 (from yum repository) and vCenter 6.7.0 Build 13007421 ...

@Mjolinir

This occurs for me with vCenter 6.7 U3 and telegraf 1.12.2 on RHEL7

@prydin (Contributor) commented Sep 27, 2019

Are you collecting a lot of cluster metrics? There's a bug in vCenter that causes issues when querying cluster metrics. In essence, it over-counts the number of objects queried and refuses the query due to perceived complexity. The only way to get around this seems to be to increase vpxd.stats.maxQueryMetrics to a much higher value or to set it to -1, meaning that query sizes are unrestricted.
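
For reference, vpxd.stats.maxQueryMetrics is a vCenter advanced setting (key config.vpxd.stats.maxQueryMetrics under Advanced Settings in the vSphere client). A rough sketch of changing it with the govc CLI, assuming govc is installed and GOVC_URL/GOVC_USERNAME/GOVC_PASSWORD point at your vCenter; on some vCenter versions the key does not exist yet and has to be created first, so treat this as illustrative:

    # Show the current value, if the key already exists
    govc option.ls config.vpxd.stats.maxQueryMetrics

    # Lift the restriction entirely (-1 = unrestricted); use a concrete value such as 256 if you prefer a cap
    govc option.set config.vpxd.stats.maxQueryMetrics -1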

@prydin (Contributor) commented Sep 27, 2019

Depending on the time series database you're using, you could work around this by simply not querying the cluster metrics, but synthesizing them from host metrics. This also has the advantage that you can use metrics collected at a 20s interval, rather than the minimum 5m for cluster metrics. I think most advanced TSDBs have aggregating queries that can e.g. present the sum of CPU utilizations for all hosts in a cluster. In fact, that's how the cluster dashboards in Wavefront are built and I'm pretty sure InfluxDB allows you to do the same. Just something to consider.
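
As an illustration, a rough InfluxQL sketch of that idea, assuming the plugin's default naming (measurement vsphere_host_cpu, field usagemhz_average, tags clustername and esxhostname) and a database called telegraf; adjust the names to your schema, and depending on your host_instances setting you may also need to filter out per-core instance series:

    -- Average each host's CPU MHz in 1m buckets, then sum the hosts per cluster
    SELECT sum("host_mhz") AS "cluster_cpu_mhz" FROM (
      SELECT mean("usagemhz_average") AS "host_mhz"
      FROM "telegraf"."autogen"."vsphere_host_cpu"
      WHERE time > now() - 1h
      GROUP BY time(1m), "clustername", "esxhostname"
    )
    GROUP BY time(1m), "clustername"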

@sspaink (Contributor) commented Jun 27, 2022

@sdesbois @AndySilvia are you still having this problem?

@sspaink added the "waiting for response" label Jun 27, 2022
@sdesbois (Author)

Very old topic.
I don't have this issue anymore.

@telegraf-tiger bot removed the "waiting for response" label Jun 29, 2022
@sspaink closed this as completed Jun 29, 2022