Computer Use

Allow agents to interact with the computer through UI automation.

Overview

Computer Use enables agents to:

  • Simulate mouse clicks

  • Type keyboard input

  • Capture screenshots

  • Navigate UI elements

  • Automate interactions

⚠️ Warning: Computer Use provides powerful system access. Use with caution and proper permissions.

Basic Setup

Enable Computer Use

// Add computer use tool
agent.AddTool(ToolType.ComputerUse);

// Configure permissions
agent.Settings.ComputerUse = new ComputerUseSettings
{
    AllowMouseControl = true,
    AllowKeyboardControl = true,
    AllowScreenshots = true
};

Mouse Control

Mouse Operations

public class MouseControl : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    void Start()
    {
        agent.AddTool("mouse_click", PerformClick);
        agent.AddTool("mouse_move", MoveMouse);
        agent.AddTool("mouse_drag", DragMouse);
    }
    
    string PerformClick(int x, int y, string button = "left")
    {
        try
        {
            // Move to position
            SetCursorPos(x, y);
            
            // Click
            uint mouseButton = button.ToLower() switch
            {
                "left" => MOUSEEVENTF_LEFTDOWN | MOUSEEVENTF_LEFTUP,
                "right" => MOUSEEVENTF_RIGHTDOWN | MOUSEEVENTF_RIGHTUP,
                "middle" => MOUSEEVENTF_MIDDLEDOWN | MOUSEEVENTF_MIDDLEUP,
                _ => MOUSEEVENTF_LEFTDOWN | MOUSEEVENTF_LEFTUP
            };
            
            mouse_event(mouseButton, 0, 0, 0, 0);
            
            return $"Clicked {button} button at ({x}, {y})";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    string MoveMouse(int x, int y)
    {
        SetCursorPos(x, y);
        return $"Moved mouse to ({x}, {y})";
    }
    
    string DragMouse(int startX, int startY, int endX, int endY)
    {
        // Move to start
        SetCursorPos(startX, startY);
        
        // Press button
        mouse_event(MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0);
        
        // Drag to end
        SetCursorPos(endX, endY);
        
        // Release button
        mouse_event(MOUSEEVENTF_LEFTUP, 0, 0, 0, 0);
        
        return $"Dragged from ({startX}, {startY}) to ({endX}, {endY})";
    }
    
    // Windows API
    [System.Runtime.InteropServices.DllImport("user32.dll")]
    static extern bool SetCursorPos(int x, int y);
    
    [System.Runtime.InteropServices.DllImport("user32.dll")]
    static extern void mouse_event(uint dwFlags, int dx, int dy, uint dwData, int dwExtraInfo);
    
    const uint MOUSEEVENTF_LEFTDOWN = 0x0002;
    const uint MOUSEEVENTF_LEFTUP = 0x0004;
    const uint MOUSEEVENTF_RIGHTDOWN = 0x0008;
    const uint MOUSEEVENTF_RIGHTUP = 0x0010;
    const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020;
    const uint MOUSEEVENTF_MIDDLEUP = 0x0040;
}

Keyboard Control

Keyboard Input

public class KeyboardControl : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    void Start()
    {
        agent.AddTool("type_text", TypeText);
        agent.AddTool("press_key", PressKey);
        agent.AddTool("key_combination", KeyCombination);
    }
    
    string TypeText(string text)
    {
        try
        {
            foreach (char c in text)
            {
                SendKey(c);
                Thread.Sleep(10); // Small delay between keys
            }
            
            return $"Typed: {text}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    string PressKey(string key)
    {
        try
        {
            KeyCode keyCode = ParseKeyCode(key);
            SimulateKeyPress(keyCode);
            
            return $"Pressed key: {key}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    string KeyCombination(string modifier, string key)
    {
        try
        {
            KeyCode modifierKey = ParseKeyCode(modifier);
            KeyCode mainKey = ParseKeyCode(key);
            
            // Press modifier
            SimulateKeyDown(modifierKey);
            
            // Press main key
            SimulateKeyPress(mainKey);
            
            // Release modifier
            SimulateKeyUp(modifierKey);
            
            return $"Pressed {modifier}+{key}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    void SendKey(char c)
    {
        // Implementation depends on platform
        // Windows: SendKeys, user32.dll
        // Mac: CGEventPost
        // Linux: xdotool
    }
    
    KeyCode ParseKeyCode(string key)
    {
        return key.ToLower() switch
        {
            "enter" => KeyCode.Return,
            "space" => KeyCode.Space,
            "tab" => KeyCode.Tab,
            "esc" or "escape" => KeyCode.Escape,
            "ctrl" or "control" => KeyCode.LeftControl,
            "shift" => KeyCode.LeftShift,
            "alt" => KeyCode.LeftAlt,
            _ => (KeyCode)System.Enum.Parse(typeof(KeyCode), key, true)
        };
    }
    
    void SimulateKeyPress(KeyCode key) { /* Implementation */ }
    void SimulateKeyDown(KeyCode key) { /* Implementation */ }
    void SimulateKeyUp(KeyCode key) { /* Implementation */ }
}

Screenshot Capture

Screen Capture

public class ScreenCapture : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private string screenshotPath = "Screenshots";
    
    void Start()
    {
        agent.AddTool("take_screenshot", TakeScreenshot);
        agent.AddTool("capture_region", CaptureRegion);
    }
    
    string TakeScreenshot()
    {
        try
        {
            string fileName = $"screenshot_{DateTime.Now:yyyyMMdd_HHmmss}.png";
            string fullPath = Path.Combine(screenshotPath, fileName);
            
            // Ensure directory exists
            Directory.CreateDirectory(screenshotPath);
            
            // Capture
            UnityEngine.ScreenCapture.CaptureScreenshot(fullPath);
            
            return $"Screenshot saved: {fullPath}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    string CaptureRegion(int x, int y, int width, int height)
    {
        try
        {
            // Create texture for region
            Texture2D screenshot = new Texture2D(width, height, TextureFormat.RGB24, false);
            
            // Read pixels from screen
            screenshot.ReadPixels(new Rect(x, y, width, height), 0, 0);
            screenshot.Apply();
            
            // Save
            byte[] bytes = screenshot.EncodeToPNG();
            string fileName = $"region_{DateTime.Now:yyyyMMdd_HHmmss}.png";
            string fullPath = Path.Combine(screenshotPath, fileName);
            
            File.WriteAllBytes(fullPath, bytes);
            
            Destroy(screenshot);
            
            return $"Region captured: {fullPath}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
}

UI Element Detection

Find UI Elements

public class UIDetection : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    void Start()
    {
        agent.AddTool("find_button", FindButton);
        agent.AddTool("find_element", FindElement);
        agent.AddTool("get_element_info", GetElementInfo);
    }
    
    string FindButton(string buttonText)
    {
        var buttons = FindObjectsOfType<Button>();
        
        foreach (var button in buttons)
        {
            var text = button.GetComponentInChildren<TMP_Text>();
            if (text != null && text.text.Contains(buttonText))
            {
                var pos = RectTransformUtility.WorldToScreenPoint(
                    Camera.main,
                    button.transform.position
                );
                
                return $"Found button at ({pos.x}, {pos.y})";
            }
        }
        
        return $"Button with text '{buttonText}' not found";
    }
    
    string FindElement(string elementName)
    {
        var element = GameObject.Find(elementName);
        
        if (element == null)
        {
            return $"Element '{elementName}' not found";
        }
        
        var rectTransform = element.GetComponent<RectTransform>();
        if (rectTransform != null)
        {
            var pos = RectTransformUtility.WorldToScreenPoint(
                Camera.main,
                rectTransform.position
            );
            
            return JsonUtility.ToJson(new
            {
                name = element.name,
                position = pos,
                size = rectTransform.sizeDelta,
                active = element.activeSelf
            });
        }
        
        return $"Element found but no RectTransform";
    }
    
    string GetElementInfo(string elementName)
    {
        var element = GameObject.Find(elementName);
        
        if (element == null)
        {
            return $"Element '{elementName}' not found";
        }
        
        var info = new
        {
            name = element.name,
            active = element.activeSelf,
            layer = LayerMask.LayerToName(element.layer),
            tag = element.tag,
            components = element.GetComponents<Component>()
                .Select(c => c.GetType().Name)
                .ToArray()
        };
        
        return JsonUtility.ToJson(info);
    }
}

Automation Workflows

Multi-Step Actions

public class AutomationWorkflow : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    void Start()
    {
        agent.AddTool("fill_form", FillForm);
        agent.AddTool("login_sequence", LoginSequence);
    }
    
    async UniTask<string> FillForm(string formData)
    {
        try
        {
            var data = JsonUtility.FromJson<FormData>(formData);
            
            // Click first field
            await ClickElement("NameField");
            await UniTask.Delay(100);
            
            // Type name
            TypeText(data.name);
            await UniTask.Delay(100);
            
            // Tab to next field
            PressKey("Tab");
            await UniTask.Delay(100);
            
            // Type email
            TypeText(data.email);
            await UniTask.Delay(100);
            
            // Click submit
            await ClickElement("SubmitButton");
            
            return "Form filled successfully";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    async UniTask<string> LoginSequence(string username, string password)
    {
        try
        {
            // Click username field
            await ClickElement("UsernameField");
            await UniTask.Delay(100);
            
            // Type username
            TypeText(username);
            await UniTask.Delay(100);
            
            // Tab to password
            PressKey("Tab");
            await UniTask.Delay(100);
            
            // Type password
            TypeText(password);
            await UniTask.Delay(100);
            
            // Press Enter
            PressKey("Enter");
            
            return "Login sequence completed";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    async UniTask ClickElement(string elementName)
    {
        var element = GameObject.Find(elementName);
        if (element != null)
        {
            var button = element.GetComponent<Button>();
            button?.onClick.Invoke();
        }
        await UniTask.Yield();
    }
    
    void TypeText(string text) { /* Implementation */ }
    void PressKey(string key) { /* Implementation */ }
    
    [System.Serializable]
    class FormData
    {
        public string name;
        public string email;
    }
}

Safety & Permissions

Permission System

public class ComputerUsePermissions : MonoBehaviour
{
    [SerializeField] private bool allowMouseControl = false;
    [SerializeField] private bool allowKeyboardControl = false;
    [SerializeField] private bool allowScreenshots = true;
    [SerializeField] private bool requireConfirmation = true;
    
    public bool CanPerformAction(string actionType)
    {
        return actionType switch
        {
            "mouse" => allowMouseControl,
            "keyboard" => allowKeyboardControl,
            "screenshot" => allowScreenshots,
            _ => false
        };
    }
    
    public async UniTask<bool> RequestPermission(string action)
    {
        if (!requireConfirmation)
            return true;
        
        // Show permission dialog
        Debug.Log($"🔐 Permission requested: {action}");
        
        // Wait for user response
        await UniTask.Delay(100);
        
        return true; // For example
    }
}

Best Practices

1. Always Validate Actions

async UniTask<string> SafeExecute(string action, Func<UniTask<string>> execute)
{
    // Check permissions
    if (!permissions.CanPerformAction(action))
    {
        return "Error: Permission denied";
    }
    
    // Request confirmation if needed
    if (!await permissions.RequestPermission(action))
    {
        return "Error: User denied permission";
    }
    
    // Execute
    try
    {
        return await execute();
    }
    catch (Exception ex)
    {
        return $"Error: {ex.Message}";
    }
}

2. Add Delays

// Allow UI to respond
await UniTask.Delay(100);

// Between keystrokes
await UniTask.Delay(50);

3. Handle Failures

int retries = 3;
for (int i = 0; i < retries; i++)
{
    try
    {
        return await PerformAction();
    }
    catch (Exception ex)
    {
        if (i == retries - 1) throw;
        await UniTask.Delay(1000);
    }
}

Complete Example

using UnityEngine;
using Glitch9.AIDevKit.Agents;
using Cysharp.Threading.Tasks;

public class ComputerUseManager : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private ComputerUsePermissions permissions;
    
    void Start()
    {
        RegisterTools();
    }
    
    void RegisterTools()
    {
        agent.AddTool("click", SafeClick);
        agent.AddTool("type", SafeType);
        agent.AddTool("screenshot", SafeScreenshot);
        agent.AddTool("find_and_click", FindAndClick);
    }
    
    async UniTask<string> SafeClick(int x, int y)
    {
        if (!permissions.CanPerformAction("mouse"))
        {
            return "Error: Mouse control not allowed";
        }
        
        if (!await permissions.RequestPermission("click"))
        {
            return "Error: Permission denied";
        }
        
        try
        {
            PerformClick(x, y);
            return $"Clicked at ({x}, {y})";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    async UniTask<string> SafeType(string text)
    {
        if (!permissions.CanPerformAction("keyboard"))
        {
            return "Error: Keyboard control not allowed";
        }
        
        if (!await permissions.RequestPermission("type"))
        {
            return "Error: Permission denied";
        }
        
        try
        {
            foreach (char c in text)
            {
                TypeCharacter(c);
                await UniTask.Delay(50);
            }
            
            return $"Typed: {text}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    string SafeScreenshot()
    {
        if (!permissions.CanPerformAction("screenshot"))
        {
            return "Error: Screenshots not allowed";
        }
        
        try
        {
            string path = $"screenshot_{DateTime.Now:yyyyMMdd_HHmmss}.png";
            UnityEngine.ScreenCapture.CaptureScreenshot(path);
            return $"Screenshot: {path}";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    async UniTask<string> FindAndClick(string elementName)
    {
        if (!permissions.CanPerformAction("mouse"))
        {
            return "Error: Mouse control not allowed";
        }
        
        try
        {
            var element = GameObject.Find(elementName);
            if (element == null)
            {
                return $"Element '{elementName}' not found";
            }
            
            var button = element.GetComponent<Button>();
            if (button != null)
            {
                if (!await permissions.RequestPermission($"click {elementName}"))
                {
                    return "Error: Permission denied";
                }
                
                button.onClick.Invoke();
                return $"Clicked {elementName}";
            }
            
            return "Element is not a button";
        }
        catch (Exception ex)
        {
            return $"Error: {ex.Message}";
        }
    }
    
    void PerformClick(int x, int y) { /* Implementation */ }
    void TypeCharacter(char c) { /* Implementation */ }
}

Next Steps

Last updated