Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ and Code editors

- Python 3.x
- ADB (Android Debug Bridge) installed and configured
- Tesseract OCR engine (for text detection features)
- Android device or emulator (not tested)

## Installation
Expand All @@ -29,7 +30,20 @@ git clone https://github.com/minhalvp/android-mcp-server.git
cd android-mcp-server
```

2. Install dependencies:
2. Install Tesseract OCR engine:

```bash
# Ubuntu/Debian
sudo apt install tesseract-ocr

# macOS (with Homebrew)
brew install tesseract

# Windows
# Download installer from: https://github.com/UB-Mannheim/tesseract/wiki
```

3. Install Python dependencies:
This project uses [uv](https://github.com/astral-sh/uv) for project
management. Install uv using any of the methods described in its
[installation guide](https://docs.astral.sh/uv/getting-started/installation/).
Expand Down Expand Up @@ -193,6 +207,29 @@ def get_screenshot() -> Image:
"""
```

```python
def get_screenshot_ocr(search_string: str, confidence_threshold: float = 60.0) -> list[dict]:
"""
Search a screenshot for text and return the location of each match.
Args:
search_string: Text string to search for in the screenshot
confidence_threshold: Minimum OCR confidence threshold 0-100 (default: 60)
Returns:
List of dictionaries containing text matches with coordinates
"""
```

```python
def get_screenshot_text(confidence_threshold: float = 30.0) -> list[dict]:
"""
Get all text from a screenshot using OCR.
Args:
confidence_threshold: Minimum OCR confidence threshold 0-100 (default: 30)
Returns:
List of dictionaries containing all detected text with coordinates
"""
```

```python
def get_package_action_intents(package_name: str) -> list[str]:
"""
Expand Down
72 changes: 72 additions & 0 deletions adbdevicemanager.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import subprocess
import sys
from typing import List, Dict, Optional

import pytesseract
from PIL import Image as PILImage
from ppadb.client import Client as AdbClient

Expand Down Expand Up @@ -139,6 +141,76 @@ def take_screenshot(self) -> None:
resized_img.save(
"compressed_screenshot.png", "PNG", quality=85, optimize=True
)
def ocr_on_screenshot(self, search_string: Optional[str] = None, confidence_threshold: float = 60.0) -> List[Dict]:
    """
    Capture a screenshot from the device and locate text in it using OCR.

    The screenshot is taken on the device, pulled to ``screenshot.png`` in
    the current working directory, and removed from the device before OCR
    runs on the local copy.

    Args:
        search_string: Text to search for. Matching is a case-insensitive
            substring test against each OCR entry on its own. If None,
            every detected text entry is returned.
        confidence_threshold: Minimum OCR confidence threshold 0-100;
            entries below it are dropped.

    Returns:
        List of dictionaries, one per retained OCR entry, with the keys
        ``text``, ``center_x``, ``center_y``, ``confidence`` and ``bbox``
        (``left``, ``top``, ``width``, ``height``).

    Raises:
        Exception: If loading the image or processing the OCR output
            fails. The original error is attached as ``__cause__``.
    """
    # NOTE(review): each OCR entry is compared individually; if the OCR
    # engine reports one entry per word, a multi-word search_string will
    # never match - confirm against pytesseract's output granularity.
    self.device.shell("screencap -p /sdcard/screenshot.png")
    self.device.pull("/sdcard/screenshot.png", "screenshot.png")
    self.device.shell("rm /sdcard/screenshot.png")

    try:
        # Lower-case the needle once instead of once per OCR entry.
        needle = None if search_string is None else search_string.lower()

        # The context manager closes the underlying file handle as soon
        # as OCR is done, instead of leaving it to garbage collection.
        with PILImage.open("screenshot.png") as image:
            ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

        results = []
        for i, raw_text in enumerate(ocr_data['text']):
            text = raw_text.strip()
            confidence = float(ocr_data['conf'][i])

            # Skip low confidence or empty text
            if confidence < confidence_threshold or not text:
                continue

            # With a search string, keep only entries that contain it
            if needle is not None and needle not in text.lower():
                continue

            left = ocr_data['left'][i]
            top = ocr_data['top'][i]
            width = ocr_data['width'][i]
            height = ocr_data['height'][i]

            results.append({
                "text": text,
                # Centre of the bounding box (integer division)
                "center_x": left + width // 2,
                "center_y": top + height // 2,
                "confidence": confidence,
                "bbox": {
                    "left": left,
                    "top": top,
                    "width": width,
                    "height": height
                }
            })

        return results

    except Exception as e:
        # Keep the generic Exception type callers may already catch, but
        # chain the original error so its traceback is not lost.
        raise Exception(f"Error processing OCR: {str(e)}") from e

def get_uilayout(self) -> str:
self.device.shell("uiautomator dump")
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies = [
"pure-python-adb>=0.3.0.dev0",
"PyYAML>=6.0",
"Pillow>=10.0.0",
"pytesseract>=0.3.0",
]

[project.optional-dependencies]
Expand Down
51 changes: 51 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,57 @@ def get_screenshot() -> Image:
deviceManager.take_screenshot()
return Image(path="compressed_screenshot.png")

@mcp.tool()
def get_screenshot_ocr(search_string: str, confidence_threshold: float = 60.0) -> list[dict]:
    """Search a screenshot for text and return the location of each match.

    Delegates to ``deviceManager.ocr_on_screenshot``.

    Args:
        search_string: Text string to search for in the screenshot
        confidence_threshold: Minimum OCR confidence threshold 0-100 (default: 60)

    Returns:
        List of dictionaries, one per text match, each of the form:
        {
            "text": "matching_text",
            "center_x": 420,
            "center_y": 750,
            "confidence": 87.5,
            "bbox": {
                "left": 380,
                "top": 735,
                "width": 80,
                "height": 30
            }
        }
    """
    return deviceManager.ocr_on_screenshot(
        search_string=search_string,
        confidence_threshold=confidence_threshold,
    )

@mcp.tool()
def get_screenshot_text(confidence_threshold: float = 30.0) -> list[dict]:
    """Get all text from a screenshot using OCR.

    Delegates to ``deviceManager.ocr_on_screenshot`` with no search string,
    so every detected entry at or above the threshold is returned.

    Args:
        confidence_threshold: Minimum OCR confidence threshold 0-100 (default: 30)

    Returns:
        List of dictionaries, one per detected text entry, each of the form:
        {
            "text": "detected_text",
            "center_x": 420,
            "center_y": 750,
            "confidence": 87.5,
            "bbox": {
                "left": 380,
                "top": 735,
                "width": 80,
                "height": 30
            }
        }
    """
    return deviceManager.ocr_on_screenshot(
        search_string=None,
        confidence_threshold=confidence_threshold,
    )


@mcp.tool()
def get_package_action_intents(package_name: str) -> list[str]:
Expand Down
Loading