Apache Mesos Metrics

Ship your Apache Mesos Metrics via Telegraf to your Logit.io Stack

Configure Telegraf to ship Apache Mesos metrics to your Logit.io stacks via Logstash.

Install Integration

Please click on the Install Integration button to configure your stack for this source.

Install Telegraf

This integration allows you to configure a Telegraf agent to send your metrics, in multiple formats, to Logit.io.

Choose the installation method for your operating system:

When you paste the command below into Powershell it will download the Telegraf zip file. Once that is complete, press Enter again and the zip file will be extracted into C:\Program Files\InfluxData\telegraf\telegraf-1.31.2.

wget https://dl.influxdata.com/telegraf/releases/telegraf-1.31.2_windows_amd64.zip -UseBasicParsing -OutFile telegraf-1.31.2_windows_amd64.zip 
Expand-Archive .\telegraf-1.31.2_windows_amd64.zip -DestinationPath 'C:\Program Files\InfluxData\telegraf'

Configure the Telegraf input plugin

The configuration file below is pre-configured to scrape the system metrics from your hosts, add the following code to the configuration file /etc/telegraf/telegraf.conf from the previous step.

-# Read metrics from one or many vCenters
[[inputs.vsphere]]
  ## List of vCenter URLs to be monitored. These three lines must be uncommented
  ## and edited for the plugin to work.
  vcenters = [ "https://vcenter.local/sdk" ]
  username = "[email protected]"
  password = "secret"
 
  ## VMs
  ## Typical VM metrics (if omitted or empty, all metrics are collected)
  # vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected)
  # vm_exclude = [] # Inventory paths to exclude
  vm_metric_include = [
    "cpu.demand.average",
    "cpu.idle.summation",
    "cpu.latency.average",
    "cpu.readiness.average",
    "cpu.ready.summation",
    "cpu.run.summation",
    "cpu.usagemhz.average",
    "cpu.used.summation",
    "cpu.wait.summation",
    "mem.active.average",
    "mem.granted.average",
    "mem.latency.average",
    "mem.swapin.average",
    "mem.swapinRate.average",
    "mem.swapout.average",
    "mem.swapoutRate.average",
    "mem.usage.average",
    "mem.vmmemctl.average",
    "net.bytesRx.average",
    "net.bytesTx.average",
    "net.droppedRx.summation",
    "net.droppedTx.summation",
    "net.usage.average",
    "power.power.average",
    "virtualDisk.numberReadAveraged.average",
    "virtualDisk.numberWriteAveraged.average",
    "virtualDisk.read.average",
    "virtualDisk.readOIO.latest",
    "virtualDisk.throughput.usage.average",
    "virtualDisk.totalReadLatency.average",
    "virtualDisk.totalWriteLatency.average",
    "virtualDisk.write.average",
    "virtualDisk.writeOIO.latest",
    "sys.uptime.latest",
  ]
  # vm_metric_exclude = [] ## Nothing is excluded by default
  # vm_instances = true ## true by default
 
  ## Hosts
  ## Typical host metrics (if omitted or empty, all metrics are collected)
  # host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected)
  # host_exclude [] # Inventory paths to exclude
  host_metric_include = [
    "cpu.coreUtilization.average",
    "cpu.costop.summation",
    "cpu.demand.average",
    "cpu.idle.summation",
    "cpu.latency.average",
    "cpu.readiness.average",
    "cpu.ready.summation",
    "cpu.swapwait.summation",
    "cpu.usage.average",
    "cpu.usagemhz.average",
    "cpu.used.summation",
    "cpu.utilization.average",
    "cpu.wait.summation",
    "disk.deviceReadLatency.average",
    "disk.deviceWriteLatency.average",
    "disk.kernelReadLatency.average",
    "disk.kernelWriteLatency.average",
    "disk.numberReadAveraged.average",
    "disk.numberWriteAveraged.average",
    "disk.read.average",
    "disk.totalReadLatency.average",
    "disk.totalWriteLatency.average",
    "disk.write.average",
    "mem.active.average",
    "mem.latency.average",
    "mem.state.latest",
    "mem.swapin.average",
    "mem.swapinRate.average",
    "mem.swapout.average",
    "mem.swapoutRate.average",
    "mem.totalCapacity.average",
    "mem.usage.average",
    "mem.vmmemctl.average",
    "net.bytesRx.average",
    "net.bytesTx.average",
    "net.droppedRx.summation",
    "net.droppedTx.summation",
    "net.errorsRx.summation",
    "net.errorsTx.summation",
    "net.usage.average",
    "power.power.average",
    "storageAdapter.numberReadAveraged.average",
    "storageAdapter.numberWriteAveraged.average",
    "storageAdapter.read.average",
    "storageAdapter.write.average",
    "sys.uptime.latest",
  ]
    ## Collect IP addresses? Valid values are "ipv4" and "ipv6"
  # ip_addresses = ["ipv6", "ipv4" ]
 
  # host_metric_exclude = [] ## Nothing excluded by default
  # host_instances = true ## true by default
 
 
  ## Clusters
  # cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
  # cluster_exclude = [] # Inventory paths to exclude
  # cluster_metric_include = [] ## if omitted or empty, all metrics are collected
  # cluster_metric_exclude = [] ## Nothing excluded by default
  # cluster_instances = false ## false by default
 
  ## Resource Pools
  # resource_pool_include = [ "/*/host/**"] # Inventory path to resource pools to collect (by default all are collected)
  # resource_pool_exclude = [] # Inventory paths to exclude
  # resource_pool_metric_include = [] ## if omitted or empty, all metrics are collected
  # resource_pool_metric_exclude = [] ## Nothing excluded by default
  # resource_pool_instances = false ## false by default
 
  ## Datastores
  # datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected)
  # datastore_exclude = [] # Inventory paths to exclude
  # datastore_metric_include = [] ## if omitted or empty, all metrics are collected
  # datastore_metric_exclude = [] ## Nothing excluded by default
  # datastore_instances = false ## false by default
 
  ## Datacenters
  # datacenter_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
  # datacenter_exclude = [] # Inventory paths to exclude
  datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
  datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
  # datacenter_instances = false ## false by default
 
  ## VSAN
  # vsan_metric_include = [] ## if omitted or empty, all metrics are collected
  # vsan_metric_exclude = [ "*" ] ## vSAN are not collected by default.
  ## Whether to skip verifying vSAN metrics against the ones from GetSupportedEntityTypes API.
  # vsan_metric_skip_verify = false ## false by default.
 
  ## Plugin Settings
  ## separator character to use for measurement and field names (default: "_")
  # separator = "_"
 
  ## number of objects to retrieve per query for realtime resources (vms and hosts)
  ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
  # max_query_objects = 256
 
  ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
  ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
  # max_query_metrics = 256
 
  ## number of go routines to use for collection and discovery of objects and metrics
  # collect_concurrency = 1
  # discover_concurrency = 1
 
  ## the interval before (re)discovering objects subject to metrics collection (default: 300s)
  # object_discovery_interval = "300s"
 
  ## timeout applies to any of the api request made to vcenter
  # timeout = "60s"
 
  ## When set to true, all samples are sent as integers. This makes the output
  ## data types backwards compatible with Telegraf 1.9 or lower. Normally all
  ## samples from vCenter, with the exception of percentages, are integer
  ## values, but under some conditions, some averaging takes place internally in
  ## the plugin. Setting this flag to "false" will send values as floats to
  ## preserve the full precision when averaging takes place.
  # use_int_samples = true
 
  ## Custom attributes from vCenter can be very useful for queries in order to slice the
  ## metrics along different dimension and for forming ad-hoc relationships. They are disabled
  ## by default, since they can add a considerable amount of tags to the resulting metrics. To
  ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
  ## to select the attributes you want to include.
  ## By default, since they can add a considerable amount of tags to the resulting metrics. To
  ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
  ## to select the attributes you want to include.
  # custom_attribute_include = []
  # custom_attribute_exclude = ["*"]
 
  ## The number of vSphere 5 minute metric collection cycles to look back for non-realtime metrics. In
  ## some versions (6.7, 7.0 and possible more), certain metrics, such as cluster metrics, may be reported
  ## with a significant delay (>30min). If this happens, try increasing this number. Please note that increasing
  ## it too much may cause performance issues.
  # metric_lookback = 3
 
  ## Optional SSL Config
  # ssl_ca = "/path/to/cafile"
  # ssl_cert = "/path/to/certfile"
  # ssl_key = "/path/to/keyfile"
  ## Use SSL but skip chain & host verification
  # insecure_skip_verify = false
 
  ## The Historical Interval value must match EXACTLY the interval in the daily
  # "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals
  # historical_interval = "5m"
 
  ## Specifies plugin behavior regarding disconnected servers
  ## Available choices :
  ##   - error: telegraf will return an error on startup if one the servers is unreachable
  ##   - ignore: telegraf will ignore unreachable servers on both startup and gather
  # disconnected_servers_behavior = "error"

Read more about how to configure data scraping and configuration options for Apache Mesos (opens in a new tab)

Configure The Output plugin

Once you have generated the configuration file, you need to set up the output plug-in to allow Telegraf to transmit your data to Logit.io in Prometheus format. This can be accomplished by incorporating the following code into your configuration file:

[[outputs.http]]
  url = "https://@metricsUsername:@metricsPassword@@metrics_id-vm.logit.io:@vmAgentPort/api/v1/write"
  data_format = "prometheusremotewrite"
 
  [outputs.http.headers]
    Content-Type = "application/x-protobuf"
    Content-Encoding = "snappy"

Start Telegraf

From the location where Telegraf was installed (C:\Program Files\InfluxData\telegraf\telegraf-1.31.2) run the program providing the chosen configuration file as a parameter:

.\telegraf.exe --config telegraf-demo.conf

Once Telegraf is running you should see output similar to the following, which confirms the inputs, output and basic configuration the application has been started with: Powershell Telegraf information

View your metrics

Data should now have been sent to your Stack.

View My Data

If you don't see take a look at How to diagnose no data in Stack below for how to diagnose common issues.

How to diagnose no data in Stack

If you don't see data appearing in your Stack after following the steps, visit the Help Centre guide for steps to diagnose no data appearing in your Stack or Chat to support now.

Telegraf Apache Mesos Overview

For effective monitoring and analysis of Apache Mesos metrics across multiple systems, it's crucial to have a robust and efficient metrics management solution. Telegraf, an open-source server agent specifically designed for collecting and reporting metrics, fits this role perfectly. It can gather Apache Mesos metrics from numerous sources such as operational Apache Mesos instances, databases, and other relevant applications.

Telegraf boasts an extensive range of input plugins, enabling users to collect metrics from various sources like CPU usage, memory utilization, network traffic, and more. These metrics are vital for understanding Apache Mesos performance. To store and examine these harvested metrics, organizations can leverage Prometheus, an open-source monitoring and alerting toolkit known for its flexible querying language and impressive data visualization capabilities.

To relay Apache Mesos metrics from Telegraf to Prometheus, organizations need to configure Telegraf to output metrics in the Prometheus format, and then set up Prometheus to scrape these metrics from the Telegraf server. This process involves setting up Telegraf to collect Apache Mesos metrics, outputting them in the Prometheus format, configuring Prometheus to retrieve these metrics from the Telegraf server, and then visually interpreting the data using Prometheus's dynamic querying and graphical visualization tools.

Once the metrics are successfully integrated into Prometheus, further analysis and visualization can be carried out using Grafana. Grafana, a leading open-source platform recognized for its monitoring and observability features, is fully compatible with Prometheus. It allows users to create dynamic, interactive dashboards for a deep-dive into the metrics data, providing a comprehensive understanding of performance trends and potential issues within the Apache Mesos system.

If you need any further assistance with shipping your log data to Logit.io we're here to help you get started. Feel free to get in contact with our support team by sending us a message via live chat & we'll be happy to assist.