
    kh                         d dl Z d dlZd dlZd dlZddlmZmZ ddl mZ ddl	m
Z
  e
       rd dlZ G d de      Z G d	 d
e      Zy)    N   )BaseOCRAgentBaseOCRElementType   )load_dataframe)is_pytesseract_availablec                   D    e Zd ZdZdZdZdZdZdZe	d        Z
e	d        Zy	)
TesseractFeatureTypez7
    The element types for Tesseract Detection API
    r   r   r         c           
          t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  di}||    S )Npage_num	block_numpar_numline_numword_num)r
   PAGEBLOCKPARALINEWORD)selfname_cvts     \/var/www/teggl/fontify/venv/lib/python3.12/site-packages/layoutparser/ocr/tesseract_agent.py	attr_namezTesseractFeatureType.attr_name(   sN     !%%z && %%y %%z %%z
 ~    c                     g d}|d | dz    S )N)r   r   r   r   r   r    )r   levelss     r   group_levelsz!TesseractFeatureType.group_levels3   s    Mjq!!r   N)__name__
__module____qualname____doc__r   r   r   r   r   propertyr   r    r   r   r   r
   r
      sH     DEDDD  " "r   r
   c                   n    e Zd ZdZdgZddZed        Zd Z	 ddZ	e
d        Ze
d	        Ze
d
        Zy)TesseractAgentz
    A wrapper for `Tesseract <https://github.com/tesseract-ocr/tesseract>`_ Text
    Detection APIs based on `PyTesseract <https://github.com/tesseract-ocr/tesseract>`_.
    pytesseractc                 b    t        |t              r|ndj                  |      | _        || _        y)af  Create a Tesseract OCR Agent.

        Args:
            languages (:obj:`list` or :obj:`str`, optional):
                You can specify the language code(s) of the documents to detect to improve
                accuracy. The supported language and their code can be found on
                `its github repo <https://github.com/tesseract-ocr/langdata>`_.
                It supports two formats: 1) you can pass in the languages code as a string
                of format like `"eng+fra"`, or 2) you can pack them as a list of strings
                `["eng", "fra"]`.
                Defaults to 'eng'.
        +N)
isinstancestrjoinlangconfigs)r   	languageskwargss      r   __init__zTesseractAgent.__init__A   s&     ",Is!;I)AT	r   c                 <    |t         j                   _         | di |S )Nr   )r(   tesseract_cmd)clstesseract_cmd_pathr1   s      r   with_tesseract_executablez(TesseractAgent.with_tesseract_executableQ   s     1C-}V}r   c                 H   i }t        j                  |fd| j                  i| j                  |d<   t        j                  |fd| j                  i| j                  }t        j                  t        j                  |      t        j                  dddt        i      |d<   |S )Nr.   textzutf-8	)quotingencodingsep
convertersdata)r(   image_to_stringr.   r/   image_to_datapdread_csvioStringIOcsv
QUOTE_NONEr,   )r   img_contentres_datas       r   _detectzTesseractAgent._detectW   s    !11
"ii
+/<<
F ))+VDIIVVkkKKNN}
F 
r   Nc                 l    | j                  |      }|r|S |r|d   S || j                  ||      S |d   S )a  Send the input image for OCR.

        Args:
            image (:obj:`np.ndarray` or :obj:`str`):
                The input image array or the name of the image file
            return_response (:obj:`bool`, optional):
                Whether directly return all output (string and boxes
                info) from Tesseract.
                Defaults to `False`.
            return_only_text (:obj:`bool`, optional):
                Whether return only the texts in the OCR results.
                Defaults to `False`.
            agg_output_level (:obj:`~TesseractFeatureType`, optional):
                When set, aggregate the GCV output with respect to the
                specified aggregation level. Defaults to `None`.
        r9   )rK   gather_data)r   imagereturn_responsereturn_only_textagg_output_levelrI   s         r   detectzTesseractAgent.detectf   sK    ( ll5!Jv;'##C)9::6{r   c           
         t        |t              s
J d|        | d   }||j                  j                             j	                  |j
                        j                  d       j                  d      j                         j                  dddd	d
ddd      j                  d d d      j                  dd	g      }t        |      S )zo
        Gather the OCR'ed text, bounding boxes, and confidence
        in a given aggeragation level.
        zInvalid agg_level r?   c           
         t        j                  | d   j                         | d   j                         | d   j                         | d   j                         | d   j	                         | d   j
                  j                  d      g      S )	Nlefttopwidthheightconfr9    )r=   )rB   Seriesminmaxmeanr,   cat)gps    r   <lambda>z,TesseractAgent.gather_data.<locals>.<lambda>   st    2996
(5	7)8((*6
)6
**s*3	 r   T)dropx_1y_1whscorer9   id)r   r   r   r   r      index)columnsc                 4    | j                   | j                  z   S N)rc   re   xs    r   ra   z,TesseractAgent.gather_data.<locals>.<lambda>       aeeacck r   c                 4    | j                   | j                  z   S rm   )rd   rf   rn   s    r   ra   z,TesseractAgent.gather_data.<locals>.<lambda>   rp   r   	rectangle)x_2y_2
block_type)r+   r
   r9   isnagroupbyr    applyreset_indexrenameassignrb   r   )response	agg_levelrI   dfs       r   rM   zTesseractAgent.gather_data   s     +
 	,	{+	, 
 v !WY++,U	 [d[#[]V!  
 V))&  
 T3*T%C 	H b!!r   c                 t    t        | d      5 }t        j                  |      }d d d        |S # 1 sw Y   S xY w)Nrb)openpickleload)filenamefprI   s      r   load_responsezTesseractAgent.load_response   s5    (D! 	"R++b/C	"
	"
s   -7c                     t        |d      5 }t        j                  | |t        j                         d d d        y # 1 sw Y   y xY w)Nwb)protocol)r   r   dumpHIGHEST_PROTOCOL)rI   	file_namer   s      r   save_responsezTesseractAgent.save_response   s?     )T" 	CbKKR&*A*AB	C 	C 	Cs	   '=A)eng)FTN)r!   r"   r#   r$   DEPENDENCIESr2   classmethodr7   rK   rR   staticmethodrM   r   r   r   r   r   r'   r'   9   sz    
 "?L   
  UYB -" -"^  
 C Cr   r'   )rD   rF   r   pandasrB   baser   r   r   
file_utilsr   r(   r
   r'   r   r   r   <module>r      sB    
 
   2  1"- "8HC\ HCr   