Skip to content

Commit

Permalink
change the calculation of gpushare allocated gpus (#552)
Browse files Browse the repository at this point in the history
* change calculation of gpushare allocated gpus

* update version to 0.8.3
  • Loading branch information
happy2048 authored May 7, 2021
1 parent ed79092 commit bdce431
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 78 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.8.2
0.8.3
6 changes: 3 additions & 3 deletions pkg/apis/types/gpunode.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ type CommonGPUNodeInfo struct {
*/

type CommonGPUNodeInfo struct {
TotalGPUs int `json:"totalGPUs" yaml:"totalGPUs"`
AllocatedGPUs int `json:"allocatedGPUs" yaml:"allocatedGPUs"`
UnhealthyGPUs int `json:"unhealthyGPUs" yaml:"unhealthyGPUs"`
TotalGPUs float64 `json:"totalGPUs" yaml:"totalGPUs"`
AllocatedGPUs float64 `json:"allocatedGPUs" yaml:"allocatedGPUs"`
UnhealthyGPUs float64 `json:"unhealthyGPUs" yaml:"unhealthyGPUs"`
GPUMetrics []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}

Expand Down
12 changes: 6 additions & 6 deletions pkg/topnode/define.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ type NodeProcesser interface {
// DisplayNodesDetails displays the details of the nodes which the processer knows
DisplayNodesDetails(w *tabwriter.Writer, nodes []Node)
// DisplayNodesSummary display nodes summary
DisplayNodesSummary(w *tabwriter.Writer, nodes []Node, showNodeType, isUnhealthy bool) (int, int, int)
DisplayNodesSummary(w *tabwriter.Writer, nodes []Node, showNodeType, isUnhealthy bool) (float64, float64, float64)
// DisplayNodesCustomSummary display custom format of target type nodes
DisplayNodesCustomSummary(w *tabwriter.Writer, nodes []Node)
// SupportedNodeType returns the supported node type
Expand Down Expand Up @@ -146,7 +146,7 @@ type nodeProcesser struct {
builder func(client *kubernetes.Clientset, node *v1.Node, index int, args buildNodeArgs) (Node, error)
canBuildNode func(node *v1.Node) bool
displayNodesDetails func(w *tabwriter.Writer, nodes []Node)
displayNodesSummary func(w *tabwriter.Writer, nodes []Node, isUnhealthy, showNodeType bool) (int, int, int)
displayNodesSummary func(w *tabwriter.Writer, nodes []Node, isUnhealthy, showNodeType bool) (float64, float64, float64)
displayNodesCustomSummary func(w *tabwriter.Writer, nodes []Node)
}

Expand Down Expand Up @@ -190,10 +190,10 @@ func (n *nodeProcesser) DisplayNodesDetails(w *tabwriter.Writer, nodes []Node) {
n.displayNodesDetails(w, myNodes)
}

func (n *nodeProcesser) DisplayNodesSummary(w *tabwriter.Writer, nodes []Node, showNodeType, isUnhealthy bool) (int, int, int) {
totalGPUs := 0
allocatedGPUs := 0
unhealthyGPUs := 0
func (n *nodeProcesser) DisplayNodesSummary(w *tabwriter.Writer, nodes []Node, showNodeType, isUnhealthy bool) (float64, float64, float64) {
totalGPUs := float64(0)
allocatedGPUs := float64(0)
unhealthyGPUs := float64(0)
myNodes := []Node{}
for _, node := range nodes {
if node.Type() != n.nodeType {
Expand Down
40 changes: 20 additions & 20 deletions pkg/topnode/gpuexclusive.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,18 @@ func (g *gpuexclusive) gpuMetricsIsEnabled() bool {
return len(g.gpuMetrics) != 0
}

func (g *gpuexclusive) getTotalGPUs() int {
func (g *gpuexclusive) getTotalGPUs() float64 {
if len(g.gpuMetrics) != 0 {
return len(g.gpuMetrics)
return float64(len(g.gpuMetrics))
}
val, ok := g.node.Status.Capacity[v1.ResourceName(types.NvidiaGPUResourceName)]
if !ok {
return 0
}
return int(val.Value())
return float64(val.Value())
}

func (g *gpuexclusive) getAllocatedGPUs() int {
func (g *gpuexclusive) getAllocatedGPUs() float64 {
allocatedGPUs := 0
for _, pod := range g.pods {
if utils.IsCompletedPod(pod) {
Expand All @@ -83,7 +83,7 @@ func (g *gpuexclusive) getAllocatedGPUs() int {
allocation := utils.GPUCountInPod(pod)
allocatedGPUs += allocation
}
return allocatedGPUs
return float64(allocatedGPUs)
}

func (g *gpuexclusive) getTotalGPUMemory() float64 {
Expand Down Expand Up @@ -145,7 +145,7 @@ func (g *gpuexclusive) getDutyCycle() float64 {
return dutyCycle / totalGPUs
}

func (g *gpuexclusive) getUnhealthyGPUs() int {
func (g *gpuexclusive) getUnhealthyGPUs() float64 {
totalGPUs := g.getTotalGPUs()
allocatableGPUs, ok := g.node.Status.Allocatable[v1.ResourceName(types.NvidiaGPUResourceName)]
if !ok {
Expand All @@ -154,7 +154,7 @@ func (g *gpuexclusive) getUnhealthyGPUs() int {
if totalGPUs <= 0 {
return 0
}
return totalGPUs - int(allocatableGPUs.Value())
return totalGPUs - float64(allocatableGPUs.Value())
}

func (g *gpuexclusive) getTotalGPUMemoryOfDevice(id string) float64 {
Expand All @@ -180,8 +180,8 @@ func (g *gpuexclusive) convert2NodeInfo() types.GPUExclusiveNodeInfo {
},
CommonGPUNodeInfo: types.CommonGPUNodeInfo{
TotalGPUs: g.getTotalGPUs(),
AllocatedGPUs: g.getAllocatedGPUs(),
UnhealthyGPUs: g.getUnhealthyGPUs(),
AllocatedGPUs: g.getAllocatedGPUs(),
GPUMetrics: metrics,
},
}
Expand Down Expand Up @@ -306,7 +306,7 @@ func (g *gpuexclusive) displayDeviceInfoUnderMetrics(lines []string, nodeInfo ty
totalGPUMemory := float64(0)
totalAllocatedGPUMemory := g.getAllocatedGPUMemory()
totalUsedGPUMemory := float64(0)
for i := 0; i < nodeInfo.TotalGPUs; i++ {
for i := 0; i < int(nodeInfo.TotalGPUs); i++ {
gpuId := fmt.Sprintf("%v", i)
devInfo, ok := deviceMap[gpuId]
if !ok {
Expand Down Expand Up @@ -390,9 +390,9 @@ func displayGPUExclusiveNodeDetails(w *tabwriter.Writer, nodes []Node) {
if len(nodes) == 0 {
return
}
totalGPUs := 0
totalUnhealthyGPUs := 0
totalAllocatedGPUs := 0
totalGPUs := float64(0)
totalUnhealthyGPUs := float64(0)
totalAllocatedGPUs := float64(0)
for _, node := range nodes {
nodeInfo := node.Convert2NodeInfo().(types.GPUExclusiveNodeInfo)
totalGPUs += nodeInfo.TotalGPUs
Expand All @@ -402,10 +402,10 @@ func displayGPUExclusiveNodeDetails(w *tabwriter.Writer, nodes []Node) {
}
}

func displayGPUExclusiveNodeSummary(w *tabwriter.Writer, nodes []Node, isUnhealthy, showNodeType bool) (int, int, int) {
totalGPUs := 0
allocatedGPUs := 0
unhealthyGPUs := 0
func displayGPUExclusiveNodeSummary(w *tabwriter.Writer, nodes []Node, isUnhealthy, showNodeType bool) (float64, float64, float64) {
totalGPUs := float64(0)
allocatedGPUs := float64(0)
unhealthyGPUs := float64(0)
for _, node := range nodes {
nodeInfo := node.Convert2NodeInfo().(types.GPUExclusiveNodeInfo)
totalGPUs += nodeInfo.TotalGPUs
Expand Down Expand Up @@ -434,7 +434,7 @@ func displayGPUExclusiveNodeSummary(w *tabwriter.Writer, nodes []Node, isUnhealt
}
PrintLine(w, items...)
}
return totalGPUs, allocatedGPUs, unhealthyGPUs
return totalGPUs, float64(allocatedGPUs), unhealthyGPUs
}

func displayGPUExclusiveNodesCustomSummary(w *tabwriter.Writer, nodes []Node) {
Expand All @@ -452,9 +452,9 @@ func displayGPUExclusiveNodesCustomSummary(w *tabwriter.Writer, nodes []Node) {
header = append(header, "UNHEALTHY")
}
PrintLine(w, header...)
totalGPUs := 0
allocatedGPUs := 0
unhealthyGPUs := 0
totalGPUs := float64(0)
allocatedGPUs := float64(0)
unhealthyGPUs := float64(0)
for _, node := range nodes {
nodeInfo := node.Convert2NodeInfo().(types.GPUExclusiveNodeInfo)
totalGPUs += nodeInfo.TotalGPUs
Expand Down
56 changes: 31 additions & 25 deletions pkg/topnode/gpushare.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package topnode

import (
"fmt"
"math"
"strings"
"text/tabwriter"

Expand Down Expand Up @@ -63,29 +64,34 @@ func (g *gpushare) gpuMetricsIsEnabled() bool {
return len(g.gpuMetrics) != 0
}

func (g *gpushare) getTotalGPUs() int {
func (g *gpushare) getTotalGPUs() float64 {
if len(g.gpuMetrics) != 0 {
return len(g.gpuMetrics)
return float64(len(g.gpuMetrics))
}
val, ok := g.node.Status.Capacity[v1.ResourceName(types.GPUShareCountName)]
if !ok {
return 0
}
return int(val.Value())
return float64(val.Value())
}

func (g *gpushare) getAllocatedGPUs() int {
allocatedGPUs := map[string]bool{}
func (g *gpushare) getAllocatedGPUs() float64 {
total := float64(0)
for _, pod := range g.pods {
if utils.IsCompletedPod(pod) {
continue
}
allocation := utils.GetPodAllocation(pod)
for key := range allocation {
allocatedGPUs[key] = true
for key, allocatedGPUMemory := range allocation {
totalGPUMemory := g.getTotalGPUMemoryOfDevice(key)
if totalGPUMemory == 0 {
continue
}
totalGPUMemory = utils.DataUnitTransfer("bytes", "GiB", totalGPUMemory)
total += float64(allocatedGPUMemory) / totalGPUMemory
}
}
return len(allocatedGPUs)
return math.Round(total*10) / 10
}

func (g *gpushare) getTotalGPUMemory() float64 {
Expand Down Expand Up @@ -145,7 +151,7 @@ func (g *gpushare) getDutyCycle() float64 {
return dutyCycle / totalGPUs
}

func (g *gpushare) getUnhealthyGPUs() int {
func (g *gpushare) getUnhealthyGPUs() float64 {
totalGPUs := g.getTotalGPUs()
totalGPUMemory, ok := g.node.Status.Capacity[v1.ResourceName(types.GPUShareResourceName)]
if !ok {
Expand All @@ -162,7 +168,7 @@ func (g *gpushare) getUnhealthyGPUs() int {
return totalGPUs
}
unhealthyGPUMemory := totalGPUMemory.Value() - allocatableGPUMemory.Value()
return int(int64(totalGPUs) * unhealthyGPUMemory / totalGPUMemory.Value())
return float64(int64(totalGPUs) * unhealthyGPUMemory / totalGPUMemory.Value())
}

func (g *gpushare) getTotalGPUMemoryOfDevice(id string) float64 {
Expand Down Expand Up @@ -198,15 +204,15 @@ func (g *gpushare) convert2NodeInfo() types.GPUShareNodeInfo {
TotalGPUMemory: g.getTotalGPUMemory(),
AllocatedGPUMemory: g.getAllocatedGPUMemory(),
CommonGPUNodeInfo: types.CommonGPUNodeInfo{
TotalGPUs: g.getTotalGPUs(),
AllocatedGPUs: g.getAllocatedGPUs(),
TotalGPUs: g.getTotalGPUs(),
UnhealthyGPUs: g.getUnhealthyGPUs(),
GPUMetrics: metrics,
},
}
// build devices
deviceMap := map[string]types.GPUShareNodeDevice{}
for i := 0; i < g.getTotalGPUs(); i++ {
for i := 0; i < int(g.getTotalGPUs()); i++ {
gpuId := fmt.Sprintf("%v", i)
deviceMap[gpuId] = types.GPUShareNodeDevice{
Id: gpuId,
Expand Down Expand Up @@ -290,7 +296,7 @@ func (g *gpushare) displayPodInfos(lines []string, nodeInfo types.GPUShareNodeIn
}
for _, podInfo := range nodeInfo.PodInfos {
items := []string{}
for i := 0; i < nodeInfo.TotalGPUs; i++ {
for i := 0; i < int(nodeInfo.TotalGPUs); i++ {
gpuId := fmt.Sprintf("%v", i)
count, ok := podInfo.Allocation[gpuId]
if !ok {
Expand Down Expand Up @@ -321,7 +327,7 @@ func (g *gpushare) displayDeviceUnderNoGPUMetric(lines []string, nodeInfo types.
for _, dev := range nodeInfo.Devices {
deviceMap[dev.Id] = dev
}
for i := 0; i < nodeInfo.TotalGPUs; i++ {
for i := 0; i < int(nodeInfo.TotalGPUs); i++ {
percent := float64(0)
gpuId := fmt.Sprintf("%v", i)
devInfo, ok := deviceMap[gpuId]
Expand Down Expand Up @@ -359,7 +365,7 @@ func (g *gpushare) displayDeviceUnderGPUMetric(lines []string, nodeInfo types.GP
for _, dev := range g.gpuMetrics {
deviceMap[dev.Id] = dev
}
for i := 0; i < nodeInfo.TotalGPUs; i++ {
for i := 0; i < int(nodeInfo.TotalGPUs); i++ {
gpuId := fmt.Sprintf("%v", i)
devInfo, ok := deviceMap[gpuId]
if !ok {
Expand Down Expand Up @@ -442,10 +448,10 @@ func displayGPUShareNodeDetails(w *tabwriter.Writer, nodes []Node) {
return
}
totalGPUMemory := float64(0)
totalGPUs := int(0)
allocatedGPUs := int(0)
totalGPUs := float64(0)
allocatedGPUs := float64(0)
allocatedGPUMemory := float64(0)
unhealthyGPUs := int(0)
unhealthyGPUs := float64(0)
for _, node := range nodes {
nodeInfo := node.Convert2NodeInfo().(types.GPUShareNodeInfo)
totalGPUs += nodeInfo.TotalGPUs
Expand All @@ -457,10 +463,10 @@ func displayGPUShareNodeDetails(w *tabwriter.Writer, nodes []Node) {
}
}

func displayGPUShareNodeSummary(w *tabwriter.Writer, nodes []Node, isUnhealthy, showNodeType bool) (int, int, int) {
totalGPUs := 0
allocatedGPUs := 0
unhealthyGPUs := 0
func displayGPUShareNodeSummary(w *tabwriter.Writer, nodes []Node, isUnhealthy, showNodeType bool) (float64, float64, float64) {
totalGPUs := float64(0)
allocatedGPUs := float64(0)
unhealthyGPUs := float64(0)
for _, node := range nodes {
nodeInfo := node.Convert2NodeInfo().(types.GPUShareNodeInfo)
totalGPUs += nodeInfo.TotalGPUs
Expand Down Expand Up @@ -507,11 +513,11 @@ func displayGPUShareNodesCustomSummary(w *tabwriter.Writer, nodes []Node) {
header = append(header, "UNHEALTHY")
}
PrintLine(w, header...)
totalGPUs := 0
totalGPUs := float64(0)
totalGPUMemory := float64(0)
allocatedGPUMemory := float64(0)
allocatedGPUs := 0
unhealthyGPUs := 0
allocatedGPUs := float64(0)
unhealthyGPUs := float64(0)
for _, node := range nodes {
nodeInfo := node.Convert2NodeInfo().(types.GPUShareNodeInfo)
totalGPUs += nodeInfo.TotalGPUs
Expand Down
Loading

0 comments on commit bdce431

Please sign in to comment.