NVIDIA tested and working; added options for the web and API ports in the Jenkinsfile

This commit is contained in:
2026-03-19 00:30:12 -07:00
parent d7d2507d43
commit cf269b83af
11 changed files with 172 additions and 60 deletions

View File

@ -49,16 +49,25 @@ class Component:
# store static properties
self.multi_check = self.is_multi()
self.virt_ignore = self._descriptor.get('virt_ignore', [])
self.multi_metrics = self._descriptor.get('multi_metrics', [])
#if 'precheck' in self._descriptor:
# precheck_command = self._descriptor.get('precheck', [])
# precheck_value = int(run_command(precheck_command, zero_only = True))
# if precheck_value == 0:
# raise ValueError(f"No devices of type {self.type}")
if self.is_virtual:
self.virt_ignore = []
self._properties: Dict[str, str] = {}
self._properties: Dict[str, str | list[str]] = {}
for key, command in descriptor.get('properties', {}).items():
return_string = True
if key in self.multi_metrics:
return_string = False
if self.this_device != "None":
# this means this component type is a multi and the commands need templating for each device
formatted_command = command.format(this_device=self.this_device)
self._properties[key] = run_command(formatted_command, True)
self._properties[key] = run_command(formatted_command, zero_only = return_string)
else:
self._properties[key] = run_command(command, zero_only = True)
self._properties[key] = run_command(command, zero_only = return_string)
print(self._properties[key])
# build the description string
self._description_template: str | None = descriptor.get("description")
@ -114,31 +123,32 @@ class Component:
component_properties = self._properties.items()
else:
component_properties = self.get_property(component)
for name, value in component_properties:
this_property = {
"Source": self.name,
"Property": name,
"Value": value
}
if name not in self.virt_ignore:
result.append(this_property)
for name, values in component_properties:
for value in (values if isinstance(values, list) else [values]):
this_property = {
"Source": self.name,
"Property": name,
"Value": value
}
if name not in self.virt_ignore:
result.append(this_property)
return result
def get_properties_strings(self, return_simple = False):
    """Render this component's properties as "name: value" strings.

    A property value may be a single string or a list of strings; a list
    produces one entry per element. Properties whose name is listed in
    ``self.virt_ignore`` are left out.

    Args:
        return_simple: When true, return the bare ``"name: value"`` strings.
            Otherwise return dicts with ``"Source"`` (this component's name)
            and ``"Property"`` (the formatted string).

    Returns:
        A list of strings or of dicts, in property order.
    """
    result = []
    component_properties = self._properties.items()
    # Debug output kept from the original implementation.
    print(component_properties)
    for name, values in component_properties:
        # The ignore check depends only on the name, so do it once per property.
        if name in self.virt_ignore:
            continue
        # Normalise scalars to a one-element list so both shapes share one path.
        for value in (values if isinstance(values, list) else [values]):
            simple_property = f"{name}: {value}"
            if return_simple:
                result.append(simple_property)
            else:
                result.append({
                    "Source": self.name,
                    "Property": simple_property
                })
    return result
def get_metrics_keys(self):
@ -318,15 +328,15 @@ class System:
multi_check = component["multi_check"]
# if multi, note that the command in device_list creates the list of things to pipe into this_device
if multi_check:
letters = [chr(c) for c in range(ord('A'), ord('Z')+1)]
print(f"Creating one component of type {component_name} for each one found")
component_type_device_list = get_device_list(component_name)
component_id = 0
for this_device in component_type_device_list:
this_component_letter = letters[component_type_device_list.index(this_device)]
this_component_name = f"{component_name} {this_component_letter}"
this_component_ID = component_type_device_list.index(this_device)
this_component_name = f"{component_name} {this_component_ID}"
print(f"{this_component_name} - {component_name} - {this_device}")
self.add_components(Component(name = this_component_name, comp_type = component_name, this_device = this_device))
new_component = Component(name = this_component_name, comp_type = component_name, this_device = this_device)
self.add_components(new_component)
else:
if debug_output:
@ -538,7 +548,13 @@ def run_command(cmd, zero_only=False, use_shell=True, req_check = True):
def get_device_list(device_type_name: str):
result = []
for component in component_class_tree:
if component["name"] == device_type_name:
precheck_value = 1
if "precheck" in component:
precheck_command = component["precheck"]
precheck_value_output = run_command(precheck_command, zero_only = True)
precheck_value = int(precheck_value_output)
print(f"Precheck found - {precheck_command} - {precheck_value}")
if component["name"] == device_type_name and precheck_value != 0:
device_list_command = component["device_list"]
device_list_result = run_command(device_list_command)
result = device_list_result

View File

@ -21,7 +21,8 @@ app_settings = {
"secure_api" : True,
"push_redis" : False,
"run_background" : True,
"update_frequency": 1
"update_frequency": 1,
"custom_api_port": "5000"
}
with open('cosmostat_settings.yaml', 'r') as f:
@ -60,6 +61,9 @@ def service_gateway_ip():
else:
return "0.0.0.0"
def service_api_port():
    """Return the API port stored under ``custom_api_port`` in the settings.

    The value is handed back exactly as it is held in ``cosmostat_settings``;
    no type conversion is applied here.
    """
    api_port = cosmostat_settings["custom_api_port"]
    return api_port
#######################################################################
### Redis Functions
#######################################################################
@ -295,7 +299,7 @@ if __name__ == '__main__':
print("Skipping flask background task")
# Flask API
app.run(debug=False, host=service_gateway_ip(), port=5000)
app.run(debug=False, host=service_gateway_ip(), port=service_api_port())

View File

@ -51,5 +51,48 @@
"metrics": {
"placeholder": ""
}
},
{
    "name": "LAN",
    "description": "{Device ID} - {Device Name} - {MAC Address}",
    "multi_check": "True",
    "device_list": "ip link | grep default | grep -v -e docker -e 127.0.0.1 -e br- -e veth -e lo -e tun | cut -d ':' -f 2 | awk '{{print $1}}' ",
    "properties": {
        "MAC Address": "ip link | grep -A1 ' {this_device}' | grep ether | awk '{{print $2}}'",
        "Device Name": "echo {this_device}",
        "Device ID": "udevadm info -q property -p $(ls -l /sys/class/net/ | grep {this_device} | cut -d '>' -f2 | cut -b 8- ) | grep ID_MODEL_FROM_DATABASE | cut -d '=' -f2 "
    },
    "metrics": {
        "IP Address": "ip -o -4 ad | grep -v -e docker -e 127.0.0.1 -e br- | grep {this_device} | awk '{{print $4}}'",
        "Data Transmitted": "ifconfig {this_device} | grep TX | grep bytes | cut -d '(' -f2 | tr -d ')'",
        "Data Received": "ifconfig {this_device} | grep RX | grep bytes | cut -d '(' -f2 | tr -d ')'",
        "Link State": "cat /sys/class/net/{this_device}/operstate",
        "Link Speed": "cat /sys/class/net/{this_device}/speed"
    },
    "multi_metrics": [
        "IP Address"
    ]
},
{
    "name": "NVGPU",
    "description": "NVGPU{Device ID} - {Device Model} with {Memory Size}, Max Power {Maximum Power}",
    "multi_check": "True",
    "device_list": "nvidia-smi --query-gpu=index --format=csv,noheader,nounits",
    "properties": {
        "Device Model": "nvidia-smi --id={this_device} --query-gpu=name --format=csv,noheader,nounits",
        "Device ID": "echo NVGPU{this_device}",
        "Driver Version": "nvidia-smi --id={this_device} --query-gpu=driver_version --format=csv,noheader,nounits",
        "Maximum Power": "nvidia-smi --id={this_device} --query-gpu=power.max_limit --format=csv,noheader,nounits",
        "Memory Size": "nvidia-smi --id={this_device} --query-gpu=memory.total --format=csv,noheader,nounits"
    },
    "metrics": {
        "Power Draw": "nvidia-smi --id={this_device} --query-gpu=power.draw --format=csv,noheader,nounits",
        "Used Memory": "nvidia-smi --id={this_device} --query-gpu=memory.used --format=csv,noheader,nounits",
        "Temperature": "nvidia-smi --id={this_device} --query-gpu=temperature.gpu --format=csv,noheader,nounits",
        "GPU Load": "nvidia-smi --id={this_device} --query-gpu=utilization.gpu --format=csv,noheader,nounits"
    },
    "precheck": "lspci | grep NV | wc -l"
}
]

View File

@ -1,24 +1,22 @@
[
{
"name": "LAN",
"name": "",
"description": "",
"multi_check": "True",
"device_list": "",
"device_list": " ",
"properties": {
"MAC Address": "",
"Device Name": "",
"Device ID": ""
},
"metrics": {
"IP Address": "",
"MB Transmitted": "",
"MB Received": "",
"Link State": "",
"Link Speed": ""
}
},
"multi_metrics": [
],
"virt_ignore": [
]
},
{
"SATA GBW": "sudo /usr/sbin/smartctl -x --json /dev/{this_device} | jq -r '.physical_block_size as $block |.ata_device_statistics.pages[] | select(.name == \"General Statistics\") | .table[] | select(.name == \"Logical Sectors Written\") | .value as $sectors | ($sectors * $block) / 1073741824 ' | awk '{{printf \"%.2f GiB Written\\n\", $0}}' || true",
"NVMe GBW": "sudo /usr/sbin/smartctl -x --json /dev/{this_device} | jq -r ' .nvme_smart_health_information_log.data_units_written as $dw | .logical_block_size as $ls | ($dw * $ls) / 1073741824 ' | awk '{{printf \"%.2f GiB Written\\n\", $0}}' || true"
}
]
]