Source code for pygats.recog

"""
module with data classes.
"""

from dataclasses import dataclass
import re
from typing import Optional, Union
import hdbscan
import pyautogui
import pytesseract
import mss
import numpy as np
import cv2 as cv
from Levenshtein import ratio
from PIL import Image
from pygats.pygats import step, passed, failed



[docs]
@dataclass
class SearchedText:
    """
    Data class to store text content, language and crop area to be passed
    as parameters for Tesseract function
    """
    content: str
    lang: str
    area: str




[docs]
@dataclass
class ROI:
    """
    Data class to store coordinates of region of interest
    x (int), y (int): coordinates of top-left point of rectangle where
    text resides
    w (int), h (int): width and height of rectangle where text resides
    """
    x: int
    y: int
    w: int
    h: int


[docs]
    def rectangle_center_coords(self):
        """
        return center of the rectangle

        Returns:
                tuple: coordinates of the rectangle center
        """
        return self.x + self.w / 2, self.y + self.h / 2





[docs]
@dataclass
class KeypointsCluster:
    """
    Data class for storing a cluster of keypoints, labels, and rectangle coordinates.
    keypoints (list): A list of keypoints representing the cluster.
    labels (list): A list of labels associated with the keypoints.
    coord_rect (list): Coordinates of the rectangle that bounds the cluster.
                                     Expected format is (x_min, y_min, x_max, y_max).

    Methods:
        __repr__(): Returns a string representation of the KeypointCluster instance,
                    including keypoints, labels, and rectangle coordinates.
    """
    keypoints: list
    labels: list
    coord_rect: tuple


[docs]
    def __repr__(self):
        return (f"keypoints={self.keypoints,}\n"
                f"labels={self.labels}\n"
                f"coord_rect={self.coord_rect}")





[docs]
def find_cropped_text(ctx, img: Image, txt: SearchedText,
                      skip: Optional[int] = 0, one_word: Optional[bool] = False):
    """
    Find text in image. Several passes are used.
    First time found area with text on image and then
    every area passed through recognition again to improve recognition results

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        img (Image): image to search text in
        txt (SearchedText): text to search
        skip (int, optional): number of occurrences of the text to skip.
        one_word (bool, optional): flag if only one word has been searched.

    Returns:
        (roi, found):
            roi(ROI): region of interest
            found (bool): whether the text is found in the image

    """
    recognized_data = pytesseract.image_to_data(img, txt.lang).split('\n')
    recognized_lines = combine_lines(recognized_data, one_word)
    roi, found = ROI(-1, -1, -1, -1), False
    for pos, content in recognized_lines:
        if content.find(txt.content) != -1:
            ctx.formatter.print_para('Найден текст ' + content)
            roi, found = pos, True
            if skip <= 0:
                break
            skip -= 1
    return roi, found




[docs]
def find_text_on_screen(ctx, txt, skip=0, one_word=False):
    """
    Function finds text on the screen

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        txt (pygats.recog.SearchedText): text to find
        skip (int, optional): amount of findings which should be skipped
        one_word (bool, optional): search only one world

    Returns:
        (roi, found):
            roi(ROI): region of interest
            found (bool): whether the text is found in the image
    """
    step(ctx, f'Поиск текста {txt.content} на экране ...')
    with mss.mss() as sct:
        img = np.array(sct.grab(sct.monitors[0]))
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
        img = Image.fromarray(img)
    roi, found = find_text(ctx, img, txt, skip, False, one_word)
    if found:
        return roi, found
    return find_text(ctx, img, txt, skip, True, one_word)




[docs]
def check_text(ctx, img: Image, txt):
    """Checks if text (txt) exists on image (img) printed with language (lang)

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        img (Image): image to find text
        txt (pygats.recog.SearchedText): text to search

    """
    step(ctx,
         f'Проверка отображения текста {txt.content} на изображении {img}...')
    _, found = find_text(ctx, img, txt)
    if not found:
        _, found = find_text(ctx, img, txt, extend=True)
        if not found:
            failed(msg=f'{txt.content} не найден на изображении')
    passed(ctx)




[docs]
def check_text_on_screen(ctx, txt):
    """Checks if text (txt) exists on the screen

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        txt (pygats.recog.SearchedText): text to search on screenshot
    """
    step(ctx, f'Проверка отображения текста {txt.content} на экране ...')
    with mss.mss() as sct:
        img = np.array(sct.grab(sct.monitors[0]))
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
        img = Image.fromarray(img)
    _, found = find_text(ctx, img, txt)
    if not found:
        _, found = find_text(ctx, img, txt, extend=True)
        if not found:
            failed(msg=f'{txt.content} не найден на экране')
    passed(ctx)




[docs]
def move_to_text(ctx, txt, skip=0):
    """Finds text on the screen and moves the cursor to it

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        txt (pygats.recog.SearchedText): text to be searched and clicked
        skip (int): amount of text should be skipped
    """
    step(ctx, f'Переместить курсор на текст {txt.content}')
    roi, found = find_text_on_screen(
        ctx, txt, skip, True)
    if not found:
        failed(msg=f'{txt.content} не найден на экране')
    ctx.formatter.print_para(f'{roi.x} {roi.y} {roi.w} {roi.h}')
    center_x, center_y = roi.rectangle_center_coords()
    pyautogui.moveTo(center_x, center_y)
    passed(ctx)




[docs]
def click_text(ctx, txt, button='left', skip=0):
    """Finds text on screen and press mouse button on it

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        txt (pygats.recog.SearchedText): text to be searched and clicked
        button (string, optional): left, right, middle
        skip (int): amount of text should be skipped
    """
    step(ctx, f'Нажать текст {txt.content} на экране кнопкой {button}...')
    roi, found = find_text_on_screen(
        ctx, txt, skip, True)
    if not found:
        failed(msg=f'{txt.content} не найден на экране')

    ctx.formatter.print_para(f'{roi.x} {roi.y} {roi.w} {roi.h}')
    center_x, center_y = roi.rectangle_center_coords()
    pyautogui.moveTo(center_x, center_y)
    pyautogui.mouseDown(center_x, center_y, button)
    pyautogui.mouseUp(center_x, center_y, button)
    passed(ctx)




[docs]
def recognize_text_with_data(img, lang):
    """Functions recognize all texts on the image with Tesseract

    Args:
        img (PIL.Image): input image to recognize text
        lang (string): language in tesseract format

    Returns:
        list: recognized text
    """
    return pytesseract.image_to_data(img, lang)




[docs]
def combine_lines(lines, one_word=False):
    """Function translate lines from Tesseract output format into
    result tuple

    Args:
        lines (List): Returns result containing box boundaries, confidences,
            and other information.
        one_word (bool, optional): one word to search

    Returns:
        list: list of (ROI, text) tuples

    Notes:
        There is magic number 5 to understand if words on the same line.
        It should be reworked in the future.

    Todo:
        * This function should be reworked in future with
          combine_words_in_lines. Need one function to combine words in
          sentences.
    """
    result = []
    for i in range(1, len(lines) - 1):
        split_line_1 = lines[i].split('\t')
        if len(split_line_1) != 12:
            return result
        roi = ROI(*map(int, split_line_1[6:10]))
        text = split_line_1[11]
        if not one_word:
            for j in range(i + 1, len(lines) - 1):
                split_line_2 = lines[j].split('\t')
                if abs(roi.y - int(split_line_2[7])) < 5 and len(split_line_2[11].strip()) > 0:
                    roi.w += int(split_line_2[8])
                    text += ' ' + split_line_2[11]
        result.append((roi, text))
    return result




[docs]
def crop_image(img: Image, width: Optional[int] = 0, height: Optional[int] = 0,
               extend: Optional[bool] = False) -> Image:
    """
    Crops a portion of the input image based on the specified width and height multipliers.
    If width and height aren't specified return an original image

    Args:
        img (Image): The input image to crop.
        width (int, optional): The multiplier to determine the beginning of the crop area by width.
        height (int, optional): The multiplier to determine the beginning of the crop area by height
        extend (bool, optional): Whether to extend the crop area by a factor of 2.

    Returns:
        (x_offset, y_offset, img_crop):
            x_offset (int), y_offset (int): offset by x and y coordinates
            img_crop (Image): The cropped image area
    """
    img_width, img_height = img.size
    factor = 1
    if extend:
        crop_width = img_width // 4
        crop_height = img_height // 4
        factor = 2
    else:
        crop_width = img_width // 3
        crop_height = img_height // 3
    crop_coord = (crop_width * width,
                  crop_height * height,
                  crop_width * width + crop_width * factor,
                  crop_height * height + crop_height * factor)
    x_offset = crop_coord[0]
    y_offset = crop_coord[1]
    img_crop = img.crop(crop_coord)
    return x_offset, y_offset, img_crop




[docs]
def find_crop_image(img: Image, crop_area: Optional[str] = 'all',
                    extend: Optional[bool] = False) -> Image:
    """
    Detects the crop area for the input image and crops the image based on the specified crop area.

    Args:
        img (Image): The input image to crop.
        crop_area (str, optional): The crop area to use. Defaults to 'all'. # noqa: DAR003
        extend (bool, optional): Whether to extend the crop area by a factor of 2.
        Defaults to False.

    Returns:
        (x_offset, y_offset, img_crop):
            x_offset (int), y_offset (int): offset by x and y coordinates
            img_crop (Image): The cropped image area
    """
    crop_area_params = {
        'center': (img, 1, 1, extend),
        'top-left': (img, 0, 0, extend),
        'left': (img, 0, 1, extend),
        'bottom-left': (img, 0, 2, extend),
        'top': (img, 1, 0, extend),
        'bottom': (img, 1, 2, extend),
        'top-right': (img, 2, 0, extend),
        'right': (img, 2, 1, extend),
        'bottom-right': (img, 2, 2, extend)
    }
    return crop_image(*crop_area_params.get(crop_area)) if crop_area_params.get(crop_area)\
        else (0, 0, img)




[docs]
def find_text(ctx, img: Image, txt, skip=0, extend=False, one_word=False):  # pylint: disable=R0917
    """Function finds text in image with Tesseract

    Args:
        ctx (Context): An object that contains information about the current
                    context.
        img (Image): image where text will be recognized
        txt (pygats.recog.SearchedText): text which fill be searched
        skip (int): amount of skipped finding
        extend (bool, optional): extended crop area
        one_word (bool, optional): one word to search

    Returns:
        (roi,found):
            roi(ROI): region of interest
            found (bool): whether the text is found in the image
    """
    x_offset, y_offset, img = find_crop_image(img, txt.area, extend=extend)
    recognized = pytesseract.image_to_data(img, txt.lang).split('\n')
    lines = combine_lines(recognized, one_word)
    roi, found = ROI(-1, -1, -1, -1), False
    for pos, content in lines[1:]:
        if content.find(txt.content) != -1:
            ctx.formatter.print_para('Найден текст ' + content)
            roi = ROI(pos.x + x_offset, pos.y + y_offset, pos.w, pos.h)
            found = True
            if skip <= 0:
                break
            skip -= 1
        else:
            if pos.x + pos.y != 0:
                cropped = img.crop(
                    (pos.x, pos.y,
                        pos.x + pos.w,
                        pos.y + pos.h))
                roi, found = find_cropped_text(
                    ctx, cropped, txt, 0, one_word)
                if found:
                    return ROI(roi.x + pos.x, roi.y + pos.y, roi.w, roi.h), found
    return roi, found




[docs]
def recognize_text(img, lang):
    """Function recognizes text in image with Tesseract and combine
    lines to tuple and return lists

    Args:
        img (PIL.Image): image where text will be recognized
        lang (string): language of text (tesseract-ocr)

    Returns:
        (x,y,w,h,text):
            x (int), y (int): coordinates of top-left point of rectangle where
               text resides
            w (int), h (int): width and height of rectangle where text resides
            text (string): full text which resides in rectangle

    Notes:
        This is wrapper function to pytesseract.image_to_data. Results of
        image_to_data are combined to lines.
    """
    recognized_data = pytesseract.image_to_data(img, lang).split('\n')
    result = combine_lines(recognized_data)
    return list(set(result))




[docs]
def find_fuzzy_text(recognized_list, search: str):
    """Fuzzy search of text in list using Levenshtein ratio
    Return value is list of tuples with following format:

    Args:
        recognized_list (list[tuple]): list of text to match with pattern (format: ROI,text)
        search (str): substring to search

    Returns:
        (roi,text, substring):
            roi(ROI): region of interest
            text (str): full text which resides in rectangle
    """
    result = []
    search_len = len(search)
    for roi, content in recognized_list:
        r = ratio(search, content, score_cutoff=0.5)
        text = content
        if r > 0.0:
            result.append((roi, content))
        elif len(text) > search_len:
            for i in range(len(text) - search_len):
                slice_for_search = text[i:i + search_len]
                r = ratio(search, slice_for_search, score_cutoff=0.8)
                if r > 0.0:
                    result.append((roi, content))
    return list(set(result))




[docs]
def find_regexp_text(recognized_list: list, pattern):
    """Find text in list by regexp
    Return value is list of tuples with following format

    Args:
        recognized_list (list): list of text to match with pattern.(format tuple: ROI,text)
        pattern (str): regexp pattern to match

    Returns:
        (roi,text, substring):
            roi(ROI): region of interest
            text (str): full text which resides in rectangle
            substring (str): substring found in text
    """
    result = []
    for roi, content in recognized_list:
        match = re.findall(pattern, content)
        if len(match) > 0:
            result.append((roi, content, tuple(match)))
    return list(set(result))




[docs]
def contrast(img: Image):
    """Function that determines the minimum and
    maximum brightness and contrast values on the image itself.
    The metrics are calculated using the YCbCr color model.
    Image.convert supports all possible conversions between “L”, “RGB” and “CMYK”.
    https://pillow.readthedocs.io/en/latest/reference/Image.html#PIL.Image.Image.convert

    Args:
        img (Image): Pil.Image that is converted from the BGR color space to YUV

    Returns:
        (contr):
            contr (float): contrast value on the image
    """
    MAX_CONTRAST = 21
    MIN_CONTRAST = 1
    image = np.array(img.convert('YCbCr'))
    Y = image[:, :, 0]
    br_min, br_max = np.min(Y), np.max(Y)
    contr = round((br_max + 0.05) / (br_min + 0.05), 3)
    # https://www.w3.org/TR/WCAG21/
    # According to WCAG, the contrast is defined in the range from 1 to 21
    contr = min(MAX_CONTRAST, max(MIN_CONTRAST, contr))
    return float(contr)




[docs]
def find_keypoints(img: Image):
    """Function that uses the SIFT algorithm to find keypoints in an image.
    The function returns three values, one of which contains the coordinates of the key points,
    which simplifies further use of the data.

    Args:
        img (Image): Pil.Image which is used to search for keypoints

    Returns:
        (keypoints, descriptors, coord_list):
            keypoints (tuple): The detected keypoints
            descriptors (numpy.ndarray): Computed descriptors
            coord_list (numpy.ndarray): Array of coordinates of keypoints
    """
    if img.mode == "L":
        gray = np.array(img)
    else:
        gray = cv.cvtColor(np.array(img), cv.COLOR_BGR2GRAY)
    sift = cv.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(gray, None)
    coord_list = []
    for kp in keypoints:
        x, y = kp.pt
        coord_list.append([x, y])
    coord_list = np.array(coord_list)
    return keypoints, descriptors, coord_list




[docs]
def hdbscan_cluster(keypoints: tuple, coord_list: np.ndarray, min_cluster_size: Optional[int] = 5,  # pylint: disable=R0914, R0917
                    min_samples: Union[int, float] = None,
                    cluster_selection_epsilon: Optional[float] = 0.0,
                    margins: Optional[tuple] = (0, 0)):
    """Function that performs clusterization of keypoints using their coordinates and HDBSCAN
    The function is used for found coordinates and keypoints.
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html#r6f313792b2b7-5

    Args:
        keypoints (tuple): Distinctive points in an image
        coord_list (np.ndarray): Array of coordinates of keypoints
        min_cluster_size (int): Min number of samples that allows to consider a group as a cluster;
        min_samples (int | float): Calculate the distance between a point and its nearest neighbor
        cluster_selection_epsilon (float): Distance threshold
        margins (tuple): Tuple of values for symmetrical boundary changes along x, y

    Returns:
        (clusters):
            clusters(list): list of cluster objects containing detailed information about
            labels, keypoints and rectangles

    """
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        gen_min_span_tree=True
    )
    clusterer.fit(coord_list)
    labels = clusterer.labels_
    clusters = []
    for label in set(labels):
        if label != -1:
            cluster_points = coord_list[labels == label]
            keypoints_in_cluster = []
            labels_in_cluster = []
            if len(cluster_points) > 0:
                x_coordinates = [point[0] for point in cluster_points]
                y_coordinates = [point[1] for point in cluster_points]
                x_min = int(min(x_coordinates))
                y_min = int(min(y_coordinates))
                x_max = int(max(x_coordinates))
                y_max = int(max(y_coordinates))
                coord_rect = (x_min - margins[0], y_min - margins[1],
                              x_max + margins[0], y_max + margins[1])
                for kp in keypoints:
                    x, y = kp.pt
                    if x_min <= x <= x_max and y_min <= y <= y_max:
                        keypoints_in_cluster.append(kp)
                        labels_in_cluster.append(int(label))
                cluster = KeypointsCluster(keypoints_in_cluster, labels_in_cluster, coord_rect)
                clusters.append(cluster)
    return clusters




[docs]
def image_difference(img_1: Image, img_2: Image):
    """Function that calculates the difference between two images and returns the coordinates of
    rectangles enclosing the areas where these differences are observed.

    Args:
        img_1 (Image): First image
        img_2 (Image): Second image

    Returns:
        (coord_rect):
            coord_rect(tuple): Tuple with the coordinates of all the bounding boxes
            that enclose the regions of difference between the two images

    """
    gray_1 = cv.cvtColor(np.array(img_1), cv.COLOR_BGR2GRAY)
    gray_2 = cv.cvtColor(np.array(img_2), cv.COLOR_BGR2GRAY)
    diff = cv.absdiff(gray_1, gray_2)
    _, thresh = cv.threshold(diff, 30, 255, cv.THRESH_BINARY)
    contours, _ = cv.findContours(thresh, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    coord_rect = []
    for contour in contours:
        (x, y, w, h) = cv.boundingRect(contour)
        coord = (x, y, x + w, y + h)
        coord_rect.append(coord)
    return tuple(coord_rect)