NVIDIA GPU support tested and working; added options for the web and API ports in the Jenkinsfile
This commit is contained in:
@ -49,16 +49,25 @@ class Component:
|
||||
# store static properties
|
||||
self.multi_check = self.is_multi()
|
||||
self.virt_ignore = self._descriptor.get('virt_ignore', [])
|
||||
self.multi_metrics = self._descriptor.get('multi_metrics', [])
|
||||
#if 'precheck' in self._descriptor:
|
||||
# precheck_command = self._descriptor.get('precheck', [])
|
||||
# precheck_value = int(run_command(precheck_command, zero_only = True))
|
||||
# if precheck_value == 0:
|
||||
# raise ValueError(f"No devices of type {self.type}")
|
||||
if self.is_virtual:
|
||||
self.virt_ignore = []
|
||||
self._properties: Dict[str, str] = {}
|
||||
self._properties: Dict[str, str | list[str]] = {}
|
||||
for key, command in descriptor.get('properties', {}).items():
|
||||
return_string = True
|
||||
if key in self.multi_metrics:
|
||||
return_string = False
|
||||
if self.this_device != "None":
|
||||
# this means this component type is a multi and the commands need templating for each device
|
||||
formatted_command = command.format(this_device=self.this_device)
|
||||
self._properties[key] = run_command(formatted_command, True)
|
||||
self._properties[key] = run_command(formatted_command, zero_only = return_string)
|
||||
else:
|
||||
self._properties[key] = run_command(command, zero_only = True)
|
||||
self._properties[key] = run_command(command, zero_only = return_string)
|
||||
print(self._properties[key])
|
||||
# build the description string
|
||||
self._description_template: str | None = descriptor.get("description")
|
||||
@ -114,31 +123,32 @@ class Component:
|
||||
component_properties = self._properties.items()
|
||||
else:
|
||||
component_properties = self.get_property(component)
|
||||
for name, value in component_properties:
|
||||
this_property = {
|
||||
"Source": self.name,
|
||||
"Property": name,
|
||||
"Value": value
|
||||
}
|
||||
if name not in self.virt_ignore:
|
||||
result.append(this_property)
|
||||
for name, values in component_properties:
|
||||
for value in (values if isinstance(values, list) else [values]):
|
||||
this_property = {
|
||||
"Source": self.name,
|
||||
"Property": name,
|
||||
"Value": value
|
||||
}
|
||||
if name not in self.virt_ignore:
|
||||
result.append(this_property)
|
||||
return result
|
||||
|
||||
def get_properties_strings(self, return_simple=False):
    """Render this component's static properties as "name: value" entries.

    A property value may be a single string or a list of strings (as the
    `isinstance(values, list)` check implies); a list produces one entry
    per element, all under the same property name.

    Properties whose name is listed in ``self.virt_ignore`` are omitted.

    Args:
        return_simple: When true, each entry is the plain string
            ``"<name>: <value>"``. Otherwise each entry is a dict with
            ``"Source"`` (this component's name) and ``"Property"``
            (the same ``"<name>: <value>"`` string).

    Returns:
        A list of entries in the iteration order of ``self._properties``.
    """
    result = []
    for name, values in self._properties.items():
        # The ignore filter depends only on the property name, so apply it
        # once per property instead of once per expanded value.
        if name in self.virt_ignore:
            continue
        # Normalise scalar properties to a one-element list so that both
        # shapes share the same rendering path.
        value_list = values if isinstance(values, list) else [values]
        for value in value_list:
            simple_property = f"{name}: {value}"
            if return_simple:
                result.append(simple_property)
            else:
                result.append({
                    "Source": self.name,
                    "Property": simple_property,
                })
    return result
|
||||
|
||||
def get_metrics_keys(self):
|
||||
@ -318,15 +328,15 @@ class System:
|
||||
multi_check = component["multi_check"]
|
||||
# if multi, note that the command in device_list creates the list of things to pipe into this_device
|
||||
if multi_check:
|
||||
letters = [chr(c) for c in range(ord('A'), ord('Z')+1)]
|
||||
print(f"Creating one component of type {component_name} for each one found")
|
||||
component_type_device_list = get_device_list(component_name)
|
||||
|
||||
component_id = 0
|
||||
for this_device in component_type_device_list:
|
||||
this_component_letter = letters[component_type_device_list.index(this_device)]
|
||||
this_component_name = f"{component_name} {this_component_letter}"
|
||||
this_component_ID = component_type_device_list.index(this_device)
|
||||
this_component_name = f"{component_name} {this_component_ID}"
|
||||
print(f"{this_component_name} - {component_name} - {this_device}")
|
||||
self.add_components(Component(name = this_component_name, comp_type = component_name, this_device = this_device))
|
||||
new_component = Component(name = this_component_name, comp_type = component_name, this_device = this_device)
|
||||
self.add_components(new_component)
|
||||
|
||||
else:
|
||||
if debug_output:
|
||||
@ -538,7 +548,13 @@ def run_command(cmd, zero_only=False, use_shell=True, req_check = True):
|
||||
def get_device_list(device_type_name: str):
|
||||
result = []
|
||||
for component in component_class_tree:
|
||||
if component["name"] == device_type_name:
|
||||
precheck_value = 1
|
||||
if "precheck" in component:
|
||||
precheck_command = component["precheck"]
|
||||
precheck_value_output = run_command(precheck_command, zero_only = True)
|
||||
precheck_value = int(precheck_value_output)
|
||||
print(f"Precheck found - {precheck_command} - {precheck_value}")
|
||||
if component["name"] == device_type_name and precheck_value != 0:
|
||||
device_list_command = component["device_list"]
|
||||
device_list_result = run_command(device_list_command)
|
||||
result = device_list_result
|
||||
|
||||
@ -21,7 +21,8 @@ app_settings = {
|
||||
"secure_api" : True,
|
||||
"push_redis" : False,
|
||||
"run_background" : True,
|
||||
"update_frequency": 1
|
||||
"update_frequency": 1,
|
||||
"custom_api_port": "5000"
|
||||
}
|
||||
|
||||
with open('cosmostat_settings.yaml', 'r') as f:
|
||||
@ -60,6 +61,9 @@ def service_gateway_ip():
|
||||
else:
|
||||
return "0.0.0.0"
|
||||
|
||||
def service_api_port():
    """Return the configured API port from the loaded settings.

    The value is read from the ``"custom_api_port"`` key of the
    module-level ``cosmostat_settings`` mapping and returned unchanged.
    """
    # NOTE(review): the default shown for this key is the string "5000",
    # so callers presumably receive a string, not an int - confirm.
    api_port = cosmostat_settings["custom_api_port"]
    return api_port
|
||||
|
||||
#######################################################################
|
||||
### Redis Functions
|
||||
#######################################################################
|
||||
@ -295,7 +299,7 @@ if __name__ == '__main__':
|
||||
print("Skipping flask background task")
|
||||
|
||||
# Flask API
|
||||
app.run(debug=False, host=service_gateway_ip(), port=5000)
|
||||
app.run(debug=False, host=service_gateway_ip(), port=service_api_port())
|
||||
|
||||
|
||||
|
||||
|
||||
@ -51,5 +51,48 @@
|
||||
"metrics": {
|
||||
"placeholder": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "LAN",
|
||||
"description": "{Device ID} - {Device Name} - {MAC Address}",
|
||||
"multi_check": "True",
|
||||
"device_list": "ip link | grep default | grep -v -e docker -e 127.0.0.1 -e br- -e veth -e lo -e tun | cut -d ':' -f 2 | awk '{{print $1}}' ",
|
||||
"properties": {
|
||||
"MAC Address": "ip link | grep -A1 ' {this_device}' | grep ether | awk '{{print $2}}'",
|
||||
"Device Name": "echo {this_device}",
|
||||
"Device ID": "udevadm info -q property -p $(ls -l /sys/class/net/ | grep {this_device} | cut -d '>' -f2 | cut -b 8- ) | grep ID_MODEL_FROM_DATABASE | cut -d '=' -f2 "
|
||||
},
|
||||
"metrics": {
|
||||
"IP Address": "ip -o -4 ad | grep -v -e docker -e 127.0.0.1 -e br- | grep {this_device} | awk '{{print $4}}'",
|
||||
"Data Transmitted": "ifconfig {this_device} | grep RX | grep bytes | cut -d '(' -f2 | tr -d ')'",
|
||||
"Data Received": "ifconfig {this_device} | grep TX | grep bytes | cut -d '(' -f2 | tr -d ')'",
|
||||
"Link State": "cat /sys/class/net/{this_device}/operstate",
|
||||
"Link Speed": "cat /sys/class/net/{this_device}/speed"
|
||||
},
|
||||
"multi_metrics": [
|
||||
"IP Address"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "NVGPU",
|
||||
"description": "NVGPU{Device ID} - {Device Model} with {Memory Size}, Max Power {Maximum Power}",
|
||||
"multi_check": "True",
|
||||
"device_list": "nvidia-smi --query-gpu=index --format=csv,noheader,nounits",
|
||||
"properties": {
|
||||
"Device Model": "nvidia-smi --id={this_device} --query-gpu=name --format=csv,noheader,nounits",
|
||||
"Device ID": "echo NVGPU{this_device}",
|
||||
"Driver Version": "nvidia-smi --id={this_device} --query-gpu=driver_version --format=csv,noheader,nounits",
|
||||
"Maximum Power": "nvidia-smi --id={this_device} --query-gpu=power.draw --format=csv,noheader,nounits",
|
||||
"Memory Size": "nvidia-smi --id={this_device} --query-gpu=memory.total --format=csv,noheader,nounits"
|
||||
|
||||
},
|
||||
"metrics": {
|
||||
"Power Draw": "nvidia-smi --id={this_device} --query-gpu=power.draw --format=csv,noheader,nounits",
|
||||
"Used Memory": "nvidia-smi --id={this_device} --query-gpu=memory.used --format=csv,noheader,nounits",
|
||||
"Temperature": "nvidia-smi --id={this_device} --query-gpu=temperature.gpu --format=csv,noheader,nounits",
|
||||
"GPU Load": "nvidia-smi --id={this_device} --query-gpu=utilization.gpu --format=csv,noheader,nounits"
|
||||
|
||||
},
|
||||
"precheck": "lspci | grep NV | wc -l"
|
||||
}
|
||||
]
|
||||
@ -1,24 +1,22 @@
|
||||
[
|
||||
{
|
||||
"name": "LAN",
|
||||
"name": "",
|
||||
"description": "",
|
||||
"multi_check": "True",
|
||||
"device_list": "",
|
||||
"device_list": " ",
|
||||
"properties": {
|
||||
"MAC Address": "",
|
||||
"Device Name": "",
|
||||
"Device ID": ""
|
||||
|
||||
},
|
||||
"metrics": {
|
||||
"IP Address": "",
|
||||
"MB Transmitted": "",
|
||||
"MB Received": "",
|
||||
"Link State": "",
|
||||
"Link Speed": ""
|
||||
}
|
||||
},
|
||||
"multi_metrics": [
|
||||
],
|
||||
"virt_ignore": [
|
||||
]
|
||||
},
|
||||
{
|
||||
"SATA GBW": "sudo /usr/sbin/smartctl -x --json /dev/{this_device} | jq -r '.physical_block_size as $block |.ata_device_statistics.pages[] | select(.name == \"General Statistics\") | .table[] | select(.name == \"Logical Sectors Written\") | .value as $sectors | ($sectors * $block) / 1073741824 ' | awk '{{printf \"%.2f GiB Written\\n\", $0}}' || true",
|
||||
"NVMe GBW": "sudo /usr/sbin/smartctl -x --json /dev/{this_device} | jq -r ' .nvme_smart_health_information_log.data_units_written as $dw | .logical_block_size as $ls | ($dw * $ls) / 1073741824 ' | awk '{{printf \"%.2f GiB Written\\n\", $0}}' || true"
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user