
    khw"                         d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ dd	lmZ  ed
g d      Z G d de	      Zy)a(  Extract fonts properties from PDF.

Font properties like font name, size are covered in :py:class:`~pdf2docx.text.TextSpan`,
but more generic properties are required further:

* Font family name. The font name extracted and set in ``TextSpan`` might not valid when
  directly used in MS Word, e.g. "ArialMT" should be "Arial". So, we need to get font
  family name, which should be accepted by MS Word, based on the font file itself.

* Font line height ratio. As line height = font_size * line_height_ratio, it's used to
  calculate relative line spacing. In general, 1.12 is an approximate value to this ratio,
  but it's in fact a font-related value, especially for CJK font.

    * So, extract font metrics, e.g. ascender and descender, with third party library ``fontTools``
      in first priority. This can obtain an accurate line height ratio, but sometimes the
      embedded font data might crash.

    * Then, we have to use the default properties, i.e. ascender and descender, extracted by
      ``PyMuPDF`` directly, but this value isn't so accurate.
    N)BytesIO)
namedtuple)TTFont   )BaseCollection)CJK_CODEPAGE_BITSCJK_UNICODE_RANGE_BITSCJK_UNICODE_RANGES)decodeFont
descriptornameline_heightc                       e Zd ZdZdefdZed        Zed        Z	edefd       Z
edefd	       Zedefd
       Zedefd       Zy)Fontsz$Extracted fonts properties from PDF.	font_namec                     | j                  |      }| D ]  }||j                  k(  s|c S  | D ]  }||j                  v s|c S  | D ]  }|j                  |v s|c S  y)z.Get matched font by font name, or return None.N)_to_descriptorr   )selfr   targetfonts       O/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/font/Fonts.pygetz	Fonts.get'   sz    $$Y/  	4Dt&t	4  	6D(+	6  	6D&(+	6     c                     t               }|D ]+  }|j                         D ]  }|j                  |d           - g }|D ]  }|j                  |      \  }}}	}
|st	        |      }| j                  |      }	 |dvsJ d       t        t        |
            }| j                  |      }| j                  |      }|j                  t        | j                  |      ||               | |      S # t        $ r d}Y Cw xY w)zExtract fonts from PDF and get properties.
        * Only embedded fonts (v.s. the base 14 fonts) can be extracted.
        * The extracted fonts may be invalid due to reason from PDF file itself.
        r   )zn/acffzbase font or not supported fontNr   )set	get_fontsaddextract_fontr   _normalized_font_namer   r   get_font_family_nameget_line_height_factor	Exceptionappendr   r   )clsfitz_docxrefspageffontsxrefbasenameext_bufferr   ttr   s                 r   extractzFonts.extract:   s     	7D^^%6uyy16	7  	*D'/'<'<T'B$Hc1fXh'H,,X6D# .0S2SS0 GFO,//3!88< LL--d3') *)	*2 5z  #"#s   8AC//C=<C=c                 N    | j                  d      d   j                  d      d   S )zMNormalize raw font name, e.g. BCDGEE+Calibri-Bold, BCDGEE+Calibri -> Calibri.+-r   )splitr   s    r   r"   zFonts._normalized_font_namec   s'     zz#r"((-a00r   r   c                 b    | j                  dd      j                  dd      j                         S )zBRemove potential space, dash in font name, and turn to upper case.  r7   )replaceupperr9   s    r   r   zFonts._to_descriptori   s+     ||C$,,S"5;;==r   tt_fontc                 J   dx}}d}d}| d   j                   D ]u  }d|j                  v r|j                  j                  d      }n|j                  j                  d      }|j                  |k(  r|s|}n|j                  |k(  r|s|}|sr|su n t        j                  |      S )z{Get the font family name from the font's names table.

        https://gist.github.com/pklaus/dce37521579513c574d0
        r<         r       z	utf-16-bezlatin-1)namesstringr   nameIDr   r"   )r?   r   familyFONT_SPECIFIER_NAME_IDFONT_SPECIFIER_FAMILY_IDrecordname_strs          r   r#   zFonts.get_font_family_nameo   s     v!"#$ fo++ 	&F&--'!==//<!==//	:}} 66t"::6!	& **622r   c                    | d   j                   }| d   }|j                  }|j                  }|j                  }|t	        |      z   }||z   }t
        j                  dk(  r<| d   }|j                  }	|j                  }
|	|
z   }t        d|||z
  z
        }||z   }|}n|}t        j                  |       }|rd|z  nd|z  }||z  S )as  Calculate line height ratio based on ``hhea`` and ``OS/2`` tables.

        Fon non-CJK fonts::

            f = (hhea.Ascent - hhea.Descent + hhea.LineGap) / units_per_em

        For non-CJK fonts (Windows)::

            f = (OS/2.winAscent + OS/2.winDescent + [External Leading]) / units_per_em
            External Leading = MAX(0, hhea.LineGap - ((OS/2.WinAscent + OS/2.winDescent) - (hhea.Ascent - hhea.Descent)))

        For CJK fonts::

            f = 1.3 * (hhea.Ascent - hhea.Descent) / units_per_em

        Read more:
        * https://docs.microsoft.com/en-us/typography/opentype/spec/recom#baseline-to-baseline-distances
        * https://github.com/source-foundry/font-line#baseline-to-baseline-distance-calculations
        * https://www.zhihu.com/question/23349103
        * https://github.com/source-foundry/font-line/blob/master/lib/fontline/metrics.py
        headhheantOS/2g        g?g      ?)
unitsPerEmascentdescentlineGapabsosr   usWinAscentusWinDescentmaxr   is_cjk_font)r?   units_per_emrN   hhea_ascenthhea_descenthhea_linegaphhea_total_heighthhea_btb_distanceos2os2_win_ascentos2_win_descentos2_win_total_heightwin_external_leadingwin_btb_distancebtb_distancecjkdistances                    r   r$   zFonts.get_line_height_factor   s    . v11 vkk||||'#l*;;.= 77D= &/C __N!..O#1O#C #&sL:NO`:`,a#b 36JJ+L -L (,/3((S5E,&&r   c                    | d   }t        j                         D ]&  \  }}t        |d      s|j                  d|z  z  s& y t	        j                         D ]  \  }}|t        dd      v r"t        |d      s!|j                  d|z  z  s4 y|t        dd      v r%t        |d      sQ|j                  d|dz
  z  z  sg y|t        dd	      v sxt        |d
      s|j                  d|dz
  z  z  s y 	 | j                         }|syt        D ],  }t        |d   |d   dz         D ]  }t        |      |v s  y . y#  Y yxY w)a  Test font object to confirm that it meets our definition of a CJK font file.

        The definition is met if any of the following conditions are True:
        1. The font has a CJK code page bit set in the OS/2 table
        2. The font has a CJK Unicode range bit set in the OS/2 table
        3. The font has any CJK Unicode code points defined in the cmap table

        https://github.com/googlefonts/fontbakery/blob/main/Lib/fontbakery/profiles/shared_conditions.py
        rP   ulCodePageRange1rB   Tr       @   ulCodePageRange2`   ulCodePageRange3F)r   itemshasattrrk   r	   rangeulUnicodeRange1ulUnicodeRange2ulUnicodeRange3getBestCmapr
   int)r?   ra   r0   bitcmapunicode_rangexs          r   rZ   zFonts.is_cjk_font   sj    fo (--/ 	FAss./C4H4HAQTH4U	
 -224 	 FAseArl"3 238K8KqTWx8Xb"%3 238K8KqUXY[U[}8]b"%3 238K8KqUXY[U[}8]	 	&&(D E/ 	 M=+]1-=a-?@  q6T> 	  	s   6D? ?EN)__name__
__module____qualname____doc__strr   classmethodr3   staticmethodr"   r   r   r#   r$   rZ    r   r   r   r   $   s    .C & % %P 1 1
 >C > >
 3V 3 34 5'v 5' 5'p ,F , ,r   r   )r   rV   ior   collectionsr   fontTools.ttLibr   common.Collectionr   common.constantsr   r	   r
   common.sharer   r   r   r   r   r   <module>r      sE   * 
  " " . ^ ^ ! & + ,
LN Lr   