minhalvp · jtabor · Sep 19, 2025 · Sep 19, 2025
diff --git a/README.md b/README.md
@@ -18,6 +18,7 @@ and Code editors
 
 - Python 3.x
 - ADB (Android Debug Bridge) installed and configured
+- Tesseract OCR engine (for text detection features)
 - Android device or emulator (not tested)
 
 ## Installation
@@ -29,7 +30,20 @@ git clone https://github.com/minhalvp/android-mcp-server.git
 cd android-mcp-server
 ```
 
-2. Install dependencies:
+2. Install Tesseract OCR engine:
+
+```bash
+# Ubuntu/Debian
+sudo apt install tesseract-ocr
+
+# macOS (with Homebrew)
+brew install tesseract
+
+# Windows
+# Download installer from: https://github.com/UB-Mannheim/tesseract/wiki
+```
+
+3. Install Python dependencies:
 This project uses [uv](https://github.com/astral-sh/uv) for project
 management via various methods of
 [installation](https://docs.astral.sh/uv/getting-started/installation/).
@@ -193,6 +207,29 @@ def get_screenshot() -> Image:
     """
 ```
 
+```python
+def get_screenshot_ocr(search_string: str, confidence_threshold: float = 60.0) -> list[dict]:
+    """
+    Search a screenshot for text and return the location of each match.
+    Args:
+        search_string: Text string to search for in the screenshot
+        confidence_threshold: Minimum OCR confidence threshold 0-100 (default: 60)
+    Returns:
+        List of dictionaries containing text matches with coordinates
+    """
+```
+
+```python
+def get_screenshot_text(confidence_threshold: float = 30.0) -> list[dict]:
+    """
+    Get all text from a screenshot using OCR.
+    Args:
+        confidence_threshold: Minimum OCR confidence threshold 0-100 (default: 30)
+    Returns:
+        List of dictionaries containing all detected text with coordinates
+    """
+```
+
 ```python
 def get_package_action_intents(package_name: str) -> list[str]:
     """

diff --git a/adbdevicemanager.py b/adbdevicemanager.py
@@ -1,7 +1,9 @@
 import os
 import subprocess
 import sys
+from typing import List, Dict, Optional
 
+import pytesseract
 from PIL import Image as PILImage
 from ppadb.client import Client as AdbClient
 
@@ -139,6 +141,76 @@ def take_screenshot(self) -> None:
             resized_img.save(
                 "compressed_screenshot.png", "PNG", quality=85, optimize=True
             )
+    def ocr_on_screenshot(self, search_string: Optional[str] = None, confidence_threshold: float = 60.0) -> List[Dict]:
+        """
+        Capture a screenshot from the device and locate text in it via OCR.
+
+        Matching is done per OCR entry. pytesseract's image_to_data usually
+        yields one entry per word, so a multi-word search_string is unlikely
+        to match (assumption about Tesseract output - confirm if relied on).
+
+        Args:
+            search_string: Case-insensitive substring to look for in each
+                detected entry (if None, returns all text)
+            confidence_threshold: Minimum OCR confidence threshold 0-100
+
+        Returns:
+            List of dictionaries containing text matches with coordinates.
+            Coordinates are pixels of the full-size screenshot; no resizing
+            is applied before OCR.
+
+        Raises:
+            RuntimeError: If the screenshot cannot be read or OCR fails
+        """
+        self.device.shell("screencap -p /sdcard/screenshot.png")
+        self.device.pull("/sdcard/screenshot.png", "screenshot.png")
+        self.device.shell("rm /sdcard/screenshot.png")
+
+        try:
+            # The context manager releases the file handle PIL holds open
+            with PILImage.open("screenshot.png") as image:
+                ocr_data = pytesseract.image_to_data(
+                    image, output_type=pytesseract.Output.DICT
+                )
+
+            # Lower-case the needle once instead of on every iteration
+            needle = None if search_string is None else search_string.lower()
+            columns = ("text", "conf", "left", "top", "width", "height")
+            results = []
+
+            # Walk the parallel OCR columns in lockstep, one entry at a time
+            for raw_text, raw_conf, left, top, width, height in zip(
+                *(ocr_data[name] for name in columns)
+            ):
+                text = raw_text.strip()
+                confidence = float(raw_conf)
+
+                # Skip low confidence or empty text
+                if confidence < confidence_threshold or not text:
+                    continue
+
+                # Skip entries that do not contain the search text
+                if needle is not None and needle not in text.lower():
+                    continue
+
+                results.append({
+                    "text": text,
+                    "center_x": left + width // 2,
+                    "center_y": top + height // 2,
+                    "confidence": confidence,
+                    "bbox": {
+                        "left": left,
+                        "top": top,
+                        "width": width,
+                        "height": height,
+                    },
+                })
+
+            return results
+
+        except Exception as e:
+            # Chain the cause so the original traceback is preserved
+            raise RuntimeError(f"Error processing OCR: {str(e)}") from e
 
     def get_uilayout(self) -> str:
         self.device.shell("uiautomator dump")

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "pure-python-adb>=0.3.0.dev0",
     "PyYAML>=6.0",
     "Pillow>=10.0.0",
+    "pytesseract>=0.3.0",
 ]
 
 [project.optional-dependencies]

diff --git a/server.py b/server.py
@@ -94,6 +94,57 @@ def get_screenshot() -> Image:
     deviceManager.take_screenshot()
     return Image(path="compressed_screenshot.png")
 
+@mcp.tool()
+def get_screenshot_ocr(search_string: str, confidence_threshold: float = 60.0) -> list[dict]:
+    """Find on-screen text via OCR and report where each match is located.
+
+    Args:
+        search_string: Text to look for in a fresh device screenshot
+        confidence_threshold: Lowest OCR confidence (0-100) a match may have
+            to be reported (default: 60)
+
+    Returns:
+        One dictionary per match, giving the text, its center point,
+        the OCR confidence and the bounding box, for example:
+        {
+            "text": "matching_text",
+            "center_x": 420,
+            "center_y": 750,
+            "confidence": 87.5,
+            "bbox": {"left": 380, "top": 735, "width": 80, "height": 30}
+        }
+    """
+    # All screenshot capture and OCR work happens in the device manager.
+    return deviceManager.ocr_on_screenshot(
+        search_string,
+        confidence_threshold,
+    )
+
+@mcp.tool()
+def get_screenshot_text(confidence_threshold: float = 30.0) -> list[dict]:
+    """Read every piece of text visible in a screenshot using OCR.
+
+    Args:
+        confidence_threshold: Lowest OCR confidence (0-100) a detection may
+            have to be reported (default: 30)
+
+    Returns:
+        One dictionary per detected text element, giving the text, its
+        center point, the OCR confidence and the bounding box, for example:
+        {
+            "text": "detected_text",
+            "center_x": 420,
+            "center_y": 750,
+            "confidence": 87.5,
+            "bbox": {"left": 380, "top": 735, "width": 80, "height": 30}
+        }
+    """
+    # Passing no search string makes the device manager return all text.
+    return deviceManager.ocr_on_screenshot(
+        search_string=None,
+        confidence_threshold=confidence_threshold,
+    )
+
 
 @mcp.tool()
 def get_package_action_intents(package_name: str) -> list[str]: