Skip to content

Box-drawing utilities

conatus.utils.browser.post_processing.draw

🎨 Utilities to draw boxes on a webpage.

Annotated screenshots help the LLM think. We have observed that an optimal approach to get LLMs to navigate the web efficiently is to give them annotated screenshots. These screenshots feature a box, with a number, around every part of the page that the LLM can act upon: input elements and clickable elements.

  • In practice, such screenshots will look like this: How the DOM is processed

This module only draws boxes. You probably will want to use the screenshot utilities to retrieve an image of the webpage with these boxes.

from conatus.utils.browser import Browser
from conatus.utils.browser.post_processing import screenshot
from conatus.utils.browser.post_processing import draw


# Open the demo page and grab the page object of the browser.
url = "https://inputtypes.com"
browser = Browser()
browser.goto(url)
page = browser.page
# Save the screenshot taken before any box has been drawn.
ss = page.last_screenshot
ss.save("tests/tmp/ss_no_boxes.png")
# At this point, the saved screenshot doesn't feature any boxes.

# Draw the boxes on the page, then retrieve fresh screenshots
# (an image and, judging by the name, its base64 counterpart).
draw.draw_boxes(page)
ss, ss_b64 = screenshot.get_screenshots(page)
ss.save("tests/tmp/ss_boxes.png")
# Now `ss` features the boxes.

Additional references

NodeInputClick

Bases: TypedDict

Type for input and clickable nodes.

BBoxDict

Bases: TypedDict

Type for a bounding box.

draw_boxes_async async

draw_boxes_async(
    page: Page, *, step: None = None, chaos: bool = False
) -> None
draw_boxes_async(
    page: None = None, *, step: Step, chaos: bool = False
) -> None
draw_boxes_async(
    page: Page | None = None,
    *,
    step: Step | None = None,
    chaos: bool = False
) -> None

Draw the boxes on the page.

The drawing is performed in the background. When a new page is loaded, the boxes will disappear.

PARAMETER DESCRIPTION
page

The page to draw on.

TYPE: Page | None DEFAULT: None

step

The step to draw on. Note that page and step are mutually exclusive.

TYPE: Step | None DEFAULT: None

chaos

Whether to artificially crash the drawing. Here for testing. Defaults to False.

TYPE: bool DEFAULT: False

RAISES DESCRIPTION
ValueError

If neither or both of page and step are provided, or if we detect an error in the bounding boxes list (e.g. the bounding boxes list is None or its length is not 4)

Source code in conatus/utils/browser/post_processing/draw.py
async def draw_boxes_async(
    page: "Page | None" = None,
    *,
    step: "Step | None" = None,
    chaos: bool = False,
) -> None:
    """Draw a box around every input and clickable node of a page.

    The nodes are taken from `step` when it is given, and from the last
    step of `page` otherwise. The drawing is performed in the background.
    When a new page is loaded, the boxes will disappear.

    Args:
        page: The page to draw on.
        step: The step to draw on. Note that `page` and `step` are mutually
            exclusive. When a step is given, the indexed nodes are also
            stored in `step.input_click_nodes`.
        chaos: Whether to artificially crash the drawing. Here for
            testing. Defaults to False.

    Raises:
        ValueError: If neither or both of `page` and `step` are provided,
            or if we detect an error in the bounding boxes list (e.g.
            the bounds of a node are None).
    """
    # Guard clauses: exactly one of `page` and `step` must be given.
    if page is None and step is None:
        msg = "Either `page` or `step` must be provided."
        raise ValueError(msg)
    if page is not None and step is not None:
        msg = "Either `page` or `step` must be provided, not both."
        raise ValueError(msg)

    if step is not None:
        nodes_by_index = get_input_click_nodes(
            step.input_elements, step.clickable_elements
        )
        pw_page = step.pw_page
        # Record the indexed nodes on the step itself.
        step.input_click_nodes = nodes_by_index
    else:
        page = cast("Page", page)  # pyright: ignore[reportUnnecessaryCast]
        nodes_by_index = get_input_click_nodes(
            page.last_step.input_elements, page.last_step.clickable_elements
        )
        pw_page = page.pw_page

    # Collect every (index, bounds) pair up front so that the error message
    # below can show the full list.
    indexed_bounds = [
        (idx, entry["node"].bounds) for idx, entry in nodes_by_index.items()
    ]

    boxes: dict[int, BBoxDict] = {}
    for idx, bounds in indexed_bounds:
        if bounds is None:  # pragma: no cover
            # This depends on the Playwright page being initialized
            # correctly, and therefore is hard to test.
            msg = (
                f"We have a problem with our b_boxes list:"
                f" {indexed_bounds}"
            )
            logger.error(msg)
            raise ValueError(msg)
        boxes[idx] = _bbox_to_dict(bounds)

    await _load_mark_page_script(pw_page)
    await _attempt_draw_boxes(pw_page, boxes, chaos=chaos)

draw_boxes

draw_boxes(page: Page, *, chaos: bool = False) -> None

Draw the boxes on the page.

More information can be found in the docstring of draw_boxes_async (conatus.utils.browser.post_processing.draw.draw_boxes_async), the async sibling of this function.

PARAMETER DESCRIPTION
page

The page to draw on.

TYPE: Page

chaos

Whether to artificially crash the drawing. Here for testing. Defaults to False.

TYPE: bool DEFAULT: False

Source code in conatus/utils/browser/post_processing/draw.py
def draw_boxes(page: "Page", *, chaos: bool = False) -> None:
    """Draw the boxes on the page (synchronous entry point).

    This hands the coroutine of [`draw_boxes_async`](
    #conatus.utils.browser.post_processing.draw.draw_boxes_async) to
    `run_async`, together with the loop held by `page.pw_page`. See the
    docstring of that async sibling for more information.

    Args:
        page: The page to draw on.
        chaos: Whether to artificially crash the drawing. Here for
            testing. Defaults to False.
    """
    drawing = draw_boxes_async(page, chaos=chaos)
    # The loop is a private attribute of the Playwright page.
    loop = page.pw_page._loop  # noqa: SLF001 # pyright: ignore[reportPrivateUsage, reportAny]
    run_async(drawing, loop=loop)

undraw_boxes_async async

undraw_boxes_async(page: Page) -> None

Undraw any boxes that have been drawn on the page.

PARAMETER DESCRIPTION
page

The page to draw on.

TYPE: Page

Source code in conatus/utils/browser/post_processing/draw.py
async def undraw_boxes_async(page: "Page") -> None:
    """Remove any boxes that have been drawn on the page.

    `_load_mark_page_script` is awaited first, then `unmarkPage()` is
    evaluated in the Playwright page.

    Args:
        page: The page whose boxes should be removed.
    """
    pw_page = page.pw_page
    await _load_mark_page_script(pw_page)
    await pw_page.evaluate("unmarkPage()")

undraw_boxes

undraw_boxes(page: Page) -> None

Undraw any boxes that have been drawn on the page.

More information can be found in the docstring of undraw_boxes_async (conatus.utils.browser.post_processing.draw.undraw_boxes_async), the async sibling of this function.

PARAMETER DESCRIPTION
page

The page to draw on.

TYPE: Page

Source code in conatus/utils/browser/post_processing/draw.py
def undraw_boxes(page: "Page") -> None:
    """Undraw any boxes that have been drawn on the page (sync entry point).

    This hands the coroutine of [`undraw_boxes_async`](
    #conatus.utils.browser.post_processing.draw.undraw_boxes_async) to
    `run_async`, together with the loop held by `page.pw_page`. See the
    docstring of that async sibling for more information.

    Args:
        page: The page whose boxes should be removed.
    """
    undrawing = undraw_boxes_async(page)
    # The loop is a private attribute of the Playwright page.
    loop = page.pw_page._loop  # noqa: SLF001 # pyright: ignore[reportPrivateUsage, reportAny]
    run_async(undrawing, loop=loop)

get_input_click_nodes

get_input_click_nodes(
    inputs: dict[int, DOMNode],
    clickables: dict[int, DOMNode],
) -> dict[int, NodeInputClick]

Convert input and clickable nodes to an indexed dict that is easier to process.

PARAMETER DESCRIPTION
inputs

The input nodes.

TYPE: dict[int, DOMNode]

clickables

The clickable nodes.

TYPE: dict[int, DOMNode]

RETURNS DESCRIPTION
NodeInputClick

A dict mapping each index to a NodeInputClick, i.e. a dict with the keys node, is_input and is_clickable.

TYPE: dict[int, NodeInputClick]

Source code in conatus/utils/browser/post_processing/draw.py
def get_input_click_nodes(
    inputs: dict[int, DOMNode], clickables: dict[int, DOMNode]
) -> dict[int, NodeInputClick]:
    """Merge input and clickable nodes into one indexed dict.

    Every key found in `inputs` or `clickables` produces one entry. The
    entries are numbered from 0, following the ascending order of the
    original keys, so that the numbering never depends on the iteration
    order of a set.

    Args:
        inputs: The input nodes.
        clickables: The clickable nodes.

    Returns:
        A dict mapping each new index to a `NodeInputClick`, i.e. a dict
        with the keys `node`, `is_input` and `is_clickable`. When a key is
        present in both dictionaries, the node from `inputs` is used.
    """
    all_nodes: dict[int, NodeInputClick] = {}
    # Sorting makes the numbering deterministic: the union of two key views
    # is a set, whose iteration order is an implementation detail.
    for i, k in enumerate(sorted(inputs.keys() | clickables.keys())):
        node_inp, node_click = inputs.get(k), clickables.get(k)
        # Test for presence with `is not None` instead of truthiness, so
        # that a node object that happens to be falsy is not dropped.
        is_input = node_inp is not None
        is_clickable = node_click is not None
        # A node that is both inputable and clickable is described by the
        # node coming from `inputs`.
        # TODO(lemeb): Find a URL where there are nodes that are inputs
        # but are not clickable (is this a thing?)
        # CTUS-14
        node = node_inp if node_inp is not None else node_click
        if node is None:  # pragma: no cover
            # Only reachable if a key is explicitly mapped to None.
            continue
        all_nodes[i] = {
            "node": node,
            "is_input": is_input,
            "is_clickable": is_clickable,
        }
    return all_nodes

html_description_for_llm

html_description_for_llm(
    all_nodes: dict[int, NodeInputClick],
) -> str

Returns a string representation of the nodes optimized for LLMs.

Alternative to the approach in Globot. See DOMNode.__repr__ for more information.

PARAMETER DESCRIPTION
all_nodes

The nodes to describe.

TYPE: dict[int, NodeInputClick]

RETURNS DESCRIPTION
str

The string representation of the nodes.

TYPE: str

Source code in conatus/utils/browser/post_processing/draw.py
def html_description_for_llm(all_nodes: dict[int, NodeInputClick]) -> str:
    """Returns a string representation of the nodes optimized for LLMs.

    Each node is rendered as a small text block that contains its index,
    its clickable/inputable flags, its `id` and `onclick` attributes
    (`N/A` when missing), its identifying attributes as JSON and its
    `repr`. The blocks are concatenated in the iteration order of
    `all_nodes`.

    Alternative to the approach in Globot. See `DOMNode.__repr__` for
    more information.

    Args:
        all_nodes (dict[int, NodeInputClick]): The nodes to describe.

    Returns:
        str: The string representation of the nodes. Empty if `all_nodes`
            is empty.
    """
    # Loop-invariant: the attributes left out of the JSON dump.
    attrs_exc = {"_uid", "uid", "node_name"}
    # Collect the pieces and join them once, rather than growing a string
    # with `+=` at every iteration.
    chunks: list[str] = []
    for i, entry in all_nodes.items():
        dom_node = entry["node"]
        attrs_json = dom_node.identifying_attributes.model_dump_json(
            exclude=attrs_exc
        )
        chunks.append(
            f"<node id={i} clickable={entry['is_clickable']}"
            f" inputable={entry['is_input']}>\n"
            f"- ID: {dom_node.attributes.get('id', 'N/A')}\n"
            f"- On Click: {dom_node.attributes.get('onclick', 'N/A')}\n"
            f"- Attributes: {attrs_json}\n"
            f"- HTML: {dom_node!r}\n"
            "------\n"
        )
    return "".join(chunks)