Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added script for PyTorch issues debugging #137

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions cmd/diagnostic/PyTorch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
package diagnostic

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/spf13/cobra"
)

// podMachineIDQuery is the GraphQL document that asks the RunPod API for the
// ID of the machine hosting a pod.
const podMachineIDQuery = `
query Pod($podId: String!) {
  pod(input: { podId: $podId }) {
    machineId
  }
}
`

// podMachineIDTimeout bounds the whole lookup so a stalled connection cannot
// hang the diagnostics run.
const podMachineIDTimeout = 15 * time.Second

// podMachineIDRequest is the JSON body posted to the GraphQL endpoint.
type podMachineIDRequest struct {
	Query     string            `json:"query"`
	Variables map[string]string `json:"variables"`
}

// podMachineIDResponse mirrors only the parts of the GraphQL response that
// are read here. Decoding into a struct (rather than map[string]interface{})
// means a null "data" or "pod" simply leaves the fields at their zero value
// instead of triggering a failed type assertion.
type podMachineIDResponse struct {
	Data struct {
		Pod struct {
			MachineID string `json:"machineId"`
		} `json:"pod"`
	} `json:"data"`
	Errors []struct {
		Message string `json:"message"`
	} `json:"errors"`
}

// getPodMachineID returns the machine ID of the pod identified by podID,
// looked up through the RunPod GraphQL API using apiKey.
//
// It is best-effort: on any failure it prints a message to stdout and
// returns the empty string, which callers treat as "not available".
func getPodMachineID(podID, apiKey string) string {
	if podID == "" || apiKey == "" {
		// Without both values the lookup cannot succeed; skip the network call.
		fmt.Println("Skipping machineId lookup: pod ID or API key is missing")
		return ""
	}

	ctx, cancel := context.WithTimeout(context.Background(), podMachineIDTimeout)
	defer cancel()

	machineID, err := fetchPodMachineID(ctx, podID, apiKey)
	if err != nil {
		fmt.Printf("Failed to fetch machineId: %v\n", err)
		return ""
	}
	return machineID
}

// fetchPodMachineID performs the GraphQL request and returns the machine ID,
// or an error describing which step failed.
func fetchPodMachineID(ctx context.Context, podID, apiKey string) (string, error) {
	payload, err := json.Marshal(podMachineIDRequest{
		Query:     podMachineIDQuery,
		Variables: map[string]string{"podId": podID},
	})
	if err != nil {
		return "", fmt.Errorf("encoding request: %w", err)
	}

	endpoint := "https://api.runpod.io/graphql?api_key=" + url.QueryEscape(apiKey)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return "", fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		// Client.Do wraps failures in *url.Error, whose text includes the full
		// URL and therefore the API key. Report only the underlying cause.
		if urlErr, ok := err.(*url.Error); ok {
			err = urlErr.Err
		}
		return "", fmt.Errorf("sending request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected HTTP status %s", resp.Status)
	}

	var result podMachineIDResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", fmt.Errorf("decoding response: %w", err)
	}

	machineID := result.Data.Pod.MachineID
	if machineID == "" {
		if len(result.Errors) > 0 {
			return "", fmt.Errorf("API error: %s", result.Errors[0].Message)
		}
		return "", fmt.Errorf("response did not include a machineId")
	}
	return machineID, nil
}

func collectEnvInfo() map[string]string {
ef0xa marked this conversation as resolved.
Show resolved Hide resolved
fmt.Println("Collecting environment information...")
envInfo := map[string]string{
"RUNPOD_POD_ID": os.Getenv("RUNPOD_POD_ID"),
"Template CUDA_VERSION": os.Getenv("CUDA_VERSION"),
"NVIDIA_DRIVER_CAPABILITIES": os.Getenv("NVIDIA_DRIVER_CAPABILITIES"),
"NVIDIA_VISIBLE_DEVICES": os.Getenv("NVIDIA_VISIBLE_DEVICES"),
"NVIDIA_PRODUCT_NAME": os.Getenv("NVIDIA_PRODUCT_NAME"),
"RUNPOD_GPU_COUNT": os.Getenv("RUNPOD_GPU_COUNT"),
"machineId": getPodMachineID(os.Getenv("RUNPOD_POD_ID"), os.Getenv("RUNPOD_API_KEY")),
}
for k, v := range envInfo {
if v == "" {
envInfo[k] = "Not Available"
}
}
return envInfo
}

// Patterns used to pull fields out of the nvidia-smi summary table. They are
// compiled once at package scope rather than on every call.
var (
	// cudaVersionRegex captures the "major.minor" CUDA version from the header.
	cudaVersionRegex = regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
	// driverVersionRegex captures dotted driver versions with two or more
	// components (for example "418.67" as well as "535.104.05").
	driverVersionRegex = regexp.MustCompile(`Driver Version: (\d+(?:\.\d+)+)`)
	// gpuNameRegex captures the name column of the first GPU row. The name is
	// followed by the persistence-mode column, which may read "On" or "Off".
	gpuNameRegex = regexp.MustCompile(`\|\s+\d+\s+([^\|]+?)\s+(?:On|Off)\s+\|`)
)

// parseNvidiaSMIOutput extracts the CUDA version, driver version and the name
// of the first listed GPU from the text printed by nvidia-smi.
//
// The returned map always contains the keys "CUDA Version", "Driver Version"
// and "GPU Name"; a field that cannot be found is set to "Not Available".
func parseNvidiaSMIOutput(output string) map[string]string {
	const notAvailable = "Not Available"

	// firstGroup returns the first capture group of re in output, trimmed,
	// or the placeholder when the pattern does not match.
	firstGroup := func(re *regexp.Regexp) string {
		if match := re.FindStringSubmatch(output); len(match) > 1 {
			return strings.TrimSpace(match[1])
		}
		return notAvailable
	}

	return map[string]string{
		"CUDA Version":   firstGroup(cudaVersionRegex),
		"Driver Version": firstGroup(driverVersionRegex),
		"GPU Name":       firstGroup(gpuNameRegex),
	}
}

func getNvidiaSMIInfo() map[string]string {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(OPTIONAL, MINOR): this function is tiny and only called once. inline it.

cmd := exec.Command("nvidia-smi")
output, err := cmd.Output()
if err != nil {
return map[string]string{"Error": fmt.Sprintf("Failed to fetch nvidia-smi info: %v", err)}
}
return parseNvidiaSMIOutput(string(output))
}

func getSystemInfo() map[string]interface{} {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(OPTIONAL, MINOR): this function is tiny and only called once. inline it.

systemInfo := map[string]interface{}{
"Environment Info": collectEnvInfo(),
"Host Machine Info": getNvidiaSMIInfo(),
}
return systemInfo
}

// cudaTestScript is the Python program run once per GPU. The single %d verb
// is replaced with the CUDA device index. The script must stay at column 0
// because Python is indentation-sensitive.
const cudaTestScript = `
import torch
device = torch.device('cuda:%d')
torch.cuda.set_device(device)
x = torch.rand(10, 10, device=device)
y = torch.rand(10, 10, device=device)
z = x + y
print("Success: CUDA is working correctly.")
`

// runCUDATest runs a small PyTorch tensor addition on every GPU index from 0
// up to the count given by the RUNPOD_GPU_COUNT environment variable.
//
// The result maps "GPU <n>" to the trimmed output of the script on success.
// On failure it maps to "Error: <cause>", followed by whatever the interpreter
// printed (typically the Python traceback) when there is any. If
// RUNPOD_GPU_COUNT is unset, not a number, or not positive, the result is a
// single "Error" entry reading "No GPUs found.".
func runCUDATest() map[string]string {
	fmt.Println("Performing CUDA operation tests on all available GPUs...")

	gpuCount, err := strconv.Atoi(os.Getenv("RUNPOD_GPU_COUNT"))
	if err != nil || gpuCount <= 0 {
		return map[string]string{"Error": "No GPUs found."}
	}

	results := make(map[string]string)
	for gpuID := 0; gpuID < gpuCount; gpuID++ {
		key := fmt.Sprintf("GPU %d", gpuID)
		script := fmt.Sprintf(cudaTestScript, gpuID)

		output, err := exec.Command("python", "-c", script).CombinedOutput()
		text := strings.TrimSpace(string(output))

		switch {
		case err == nil:
			results[key] = text
		case text == "":
			// Nothing was printed, e.g. the interpreter could not be started.
			results[key] = fmt.Sprintf("Error: %v", err)
		default:
			// Keep the captured output: the exit status alone does not say
			// why the CUDA operation failed.
			results[key] = fmt.Sprintf("Error: %v: %s", err, text)
		}
	}

	return results
}

// saveInfoToFile writes info to filename as indented JSON (one space per
// level) with permissions 0644 and tells the user where the report was saved.
//
// If encoding or writing fails, a failure message is printed instead and no
// success message is shown, so the user is never pointed at a file that was
// not written.
func saveInfoToFile(info map[string]interface{}, filename string) {
	jsonData, err := json.MarshalIndent(info, "", " ")
	if err != nil {
		fmt.Printf("Failed to encode diagnostics information: %v\n", err)
		return
	}

	if err := ioutil.WriteFile(filename, jsonData, 0644); err != nil {
		fmt.Printf("Failed to save diagnostics information to %s: %v\n", filename, err)
		return
	}

	fmt.Printf("Diagnostics information saved to %s. Please share this file with RunPod Tech Support for further assistance.\n", filename)
}

// GpuDiagnosticsCmd is the cobra command that runs the PyTorch CUDA
// diagnostics and writes the collected report to disk.
var GpuDiagnosticsCmd = &cobra.Command{
	Use:   "PyTorch",
	Short: "Run PyTorch CUDA test",
	Long:  `This command performs a series of diagnostics tests on the GPUs available in your system for RunPod.`,
	Run:   runGpuDiagnostics,
}

// runGpuDiagnostics is the Run handler of GpuDiagnosticsCmd. It gathers the
// system information, adds the per-GPU CUDA test results under
// "CUDA Test Result", and saves everything to /workspace/gpu_diagnostics.json.
// Neither the command nor its arguments are consulted.
func runGpuDiagnostics(_ *cobra.Command, _ []string) {
	fmt.Println("RunPod GPU Diagnostics Tool")

	report := getSystemInfo()
	report["CUDA Test Result"] = runCUDATest()

	saveInfoToFile(report, "/workspace/gpu_diagnostics.json")
}
17 changes: 17 additions & 0 deletions cmd/gpu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package cmd

import (
"cmd/diagnostic"

"github.com/spf13/cobra"
)

// gpuTestCmd is the parent "gpu-test" command. It defines no Run handler of
// its own; it only groups the GPU diagnostic subcommands attached in init.
//
// NOTE(review): the diff header names this file cmd/gpu_test.go. The Go
// toolchain only compiles files ending in _test.go when running "go test", so
// gpuTestCmd would be undefined in a normal build of the package — confirm
// and consider renaming the file (for example to gpu_diagnostics.go).
var gpuTestCmd = &cobra.Command{
Use: "gpu-test",
Short: "GPU test commands",
Long: "Commands for testing GPU functionality",
}

// init registers the PyTorch diagnostics command as a subcommand of
// gpuTestCmd, making it reachable as "gpu-test PyTorch".
//
// NOTE(review): the diagnostic package is imported as "cmd/diagnostic", which
// is not qualified with a module path — verify that it resolves in this
// repository's module.
func init() {
gpuTestCmd.AddCommand(diagnostic.GpuDiagnosticsCmd)
}
3 changes: 3 additions & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ func registerCommands() {
rootCmd.AddCommand(updateCmd)
rootCmd.AddCommand(sshCmd)

//Diagnostic tools
rootCmd.AddCommand(gpuTestCmd)

// Remote File Execution
rootCmd.AddCommand(execCmd)

Expand Down