
    khK                     `    d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ  G d d	e	e      Zy
)aC  Document layout depends on Blocks and Shapes.

**Layout** here refers to the content and position of text, image and table. The target is to convert
source blocks and shapes to a *flow layout* that can be re-created as docx elements like paragraph and
table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout .
So, detecting and parsing table block is the principle steps.

The prerequisite work is done before this step:

1. Clean up source blocks and shapes in Page level, e.g. convert source blocks to ``Line`` level,
   because the block structure determined by ``PyMuPDF`` might be not reasonable.
#. Parse structure in document level, e.g. page header/footer.
#. Parse Section and Column layout in Page level.

The page layout parsing idea:

1. Parse table layout in Column level.
    (a) Detect explicit tables first based on shapes.
    (#) Then, detect stream tables based on original text blocks and parsed explicit tables.
    (#) Move table contained blocks (lines or explicit table) to associated cell-layout.
#. Parse paragraph in Column level.
    (a) Detect text blocks by combining related lines.
    (#) Parse paragraph style, e.g. text format, alignment
#. Calculate vertical spacing based on parsed tables and paragraphs.
#. Repeat above steps for cell-layout in parsed table level.
    )ABCabstractmethod   )Line)	constants)Element)Shapesc                        e Zd ZdZd fd	Zeed               Z fdZde	fdZ
defdZd	efd
Zd Zd Zd Zd Z xZS )Layoutz(Blocks and shapes structure and formats.c                     ddl m } ddlm} |rd|ini }t        |   |        ||       | _        t        |       | _         ||       | _        y)z Initialize layout. Note that layout bbox must be set explicitly,
        rather than calculated automatically from contained blocks and shapes.   )Blocksr   )TablesConstructorbbox)parentN)	r   table.TablesConstructorr   super__init__blocksr	   shapes_table_parser)selfr   r   r   raw	__class__s        R/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/layout/Layout.pyr   zLayout.__init__&   sP     	#? $vtn" D) D) /d;    c                      y)zWorking bbox of current Layout.N )r   s    r   working_bboxzLayout.working_bbox9   s    r   c                     t         |          }|j                  | j                  j                         | j                  j                         d       |S )z#Store parsed layout in dict format.)r   r   )r   storeupdater   r   )r   resr   s     r   r!   zLayout.store?   sF    gmo

kk'')kk'')
 	 
r   datac                     | j                  |j                  dd             | j                  j                  |j                  dg              | j                  j                  |j                  dg              | S )z#Restore Layout from parsed results.r   )r   r   r   r   r   r   )update_bboxgetr   restorer   )r   r$   s     r   r(   zLayout.restoreI   sZ    &&12DHHXr23DHHXr23r   r   c                 4    |D ]  }| j                  |        y)zAdd blocks (line or table block) to this layout.

        Args:
            blocks (list): a list of text line or table block to add.

        .. note::
            If a text line is partly contained, it must deep into span -> char.
        N)_assign_block)r   r   blocks      r   assign_blockszLayout.assign_blocksQ   s     6ET//66r   r   c                     |D ]C  }| j                   j                  |j                        s)| j                  j	                  |       E y)zlAdd shapes to this cell.

        Args:
            shapes (list): a list of Shape instance to add.
        N)r   
intersectsr   r   append)r   r   shapes      r   assign_shapeszLayout.assign_shapes]   s>      	SE  ++EJJ79K9KE9R	Sr   c                     | j                   sy | j                  di |  | j                  di | t        d | j                         D ]  } |j                  di |  y)z]Parse layout.

        Args:
            settings (dict): Layout parsing parameters.
        Nc                     | j                   S N)is_table_block)es    r   <lambda>zLayout.parse.<locals>.<lambda>w   s    a&6&6 r   r   )r   _parse_table_parse_paragraphfilterparse)r   settingsr+   s      r   r;   zLayout.parseh   se     {{F 	%H% 	)) 6D 	$EEKK#(#	$r   c                 T   | j                  |t        j                        r| j                  j	                  |       yt        |t              r[| j                  j                  |j                        r5| j                  j	                  |j                  | j                               yyy)z/Add block (line or table block) to this layout.)	thresholdN)	containsr   FACTOR_MAJORr   r/   
isinstancer   r   r.   )r   r+   s     r   r*   zLayout._assign_block{   sw     ==)*@*@=AKKu% t$)=)=ejj)IKKu//		:; *J$r   c                     |d   r&| j                   j                  |d   |d   |d          |d   r'| j                   j                  |d   |d   |d          yy)a  Parse table layout:

        * detect explicit tables first based on shapes,
        * then stream tables based on original text blocks and parsed explicit tables;
        * move table contained blocks (text block or explicit table) to associated cell layout.
        parse_lattice_tableconnected_border_tolerancemin_border_clearancemax_border_widthparse_stream_tableline_separate_thresholdN)r   lattice_tablesstream_tablesr   r<   s     r   r8   zLayout._parse_table   s{     )*--56/0+,. (),,/0+,235 *r   c           	         | j                   j                  |d   |d   |d          | j                   j                  | j                  j                  |d          | j                   j                  |d   |d   |d   |d   |d   |d	          y
)zlCreate text block based on lines, and parse text format, e.g. text highlight,
        paragraph indentation max_line_spacing_ratioline_break_free_space_rationew_paragraph_free_space_ratiodelete_end_line_hyphenrH   line_break_width_ratiolines_left_aligned_thresholdlines_right_aligned_thresholdlines_center_aligned_thresholdN)r   parse_blockparse_text_formatr   text_style_shapesparse_spacingrK   s     r   r9   zLayout._parse_paragraph   s     	-.2356	8 	%%KK))-.	0
 	!!./-.23344556	8r   r4   )__name__
__module____qualname____doc__r   propertyr   r   r!   dictr(   listr,   r1   r;   r*   r8   r9   __classcell__)r   s   @r   r   r   #   sf    2<& .  .4 	74 	7S4 S$&<5,8r   r   N)r\   abcr   r   	text.Liner   commonr   common.Elementr   shape.Shapesr	   r   r   r   r   <module>rf      s+   6 &   $ !N8Wc N8r   