U
    cc,~                      @   s  d dl mZmZmZ d dlmZ d dlmZmZ d dl	Z	d dl
Z
d dlmZmZ d dlZddlmZmZmZmZ ddlmZ dd	lmZ ed
d eD Zedd eD Zedd eD ZeeddgB ZdZejred dkreddkste
 edd e!d d Z"n
e
 eZ"ddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3h Z#e
 d4Z$i Z%G d5d6 d6e&Z'd7d8 Z(G d9d: d:e&Z)G d;d< d<e)Z*G d=d> d>e+Z,G d?d@ d@e&Z-G dAdB dBe&Z.dCdD Z/dS )E    )absolute_importdivisionunicode_literals)	text_type)http_clienturllibN)BytesIOStringIO   )EOFspaceCharactersasciiLettersasciiUppercase)_ReparseException)_utilsc                 C   s   g | ]}| d qS asciiencode.0item r   O/var/www/html/project/venv/lib/python3.8/site-packages/html5lib/_inputstream.py
<listcomp>   s     r   c                 C   s   g | ]}| d qS r   r   r   r   r   r   r      s     c                 C   s   g | ]}| d qS r   r   r   r   r   r   r      s        >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿]]z"\uD800-\uDFFF"i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i i z[	- -/:-@\[-`{-~]c                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )BufferedStreamzBuffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    c                 C   s   || _ g | _ddg| _d S )Nr   r   )streambufferposition)selfr    r   r   r   __init__:   s    zBufferedStream.__init__c                 C   s<   d}| j d | jd  D ]}|t|7 }q|| jd 7 }|S Nr   r
   )r!   r"   len)r#   poschunkr   r   r   tell?   s
    zBufferedStream.tellc                 C   sT   ||   kst|}d}t| j| |k rF|t| j| 8 }|d7 }q||g| _d S r%   )_bufferedBytesAssertionErrorr&   r!   r"   )r#   r'   offsetir   r   r   seekF   s    
zBufferedStream.seekc                 C   sT   | j s| |S | jd t| j krF| jd t| j d krF| |S | |S d S )Nr   r
   r   )r!   _readStreamr"   r&   _readFromBufferr#   bytesr   r   r   readO   s    

zBufferedStream.readc                 C   s   t dd | jD S )Nc                 S   s   g | ]}t |qS r   )r&   r   r   r   r   r   Y   s     z1BufferedStream._bufferedBytes.<locals>.<listcomp>)sumr!   r#   r   r   r   r*   X   s    zBufferedStream._bufferedBytesc                 C   s<   | j |}| j| | jd  d7  < t|| jd< |S r%   )r    r3   r!   appendr"   r&   )r#   r2   datar   r   r   r/   [   s
    zBufferedStream._readStreamc                 C   s   |}g }| j d }| j d }|t| jk r|dkr|dks>t| j| }|t|| krl|}||| g| _ n"t|| }|t|g| _ |d7 }|||||   ||8 }d}q|r|| | d|S )Nr   r
       )r"   r&   r!   r+   r6   r/   join)r#   r2   ZremainingBytesrvZbufferIndexZbufferOffsetZbufferedDataZbytesToReadr   r   r   r0   b   s&    


zBufferedStream._readFromBufferN)__name__
__module____qualname____doc__r$   r)   r.   r3   r*   r/   r0   r   r   r   r   r   3   s   		r   c                 K   s   t | tjs(t | tjjr.t | jtjr.d}n&t| drJt | dt	}n
t | t	}|rdd |D }|rvt
d| t| f|S t| f|S d S )NFr3   r   c                 S   s   g | ]}| d r|qS )	_encoding)endswith)r   xr   r   r   r      s     
 z#HTMLInputStream.<locals>.<listcomp>z3Cannot set an encoding with a unicode input, set %r)
isinstancer   HTTPResponser   responseaddbasefphasattrr3   r   	TypeErrorHTMLUnicodeInputStreamHTMLBinaryInputStream)sourcekwargsZ	isUnicode	encodingsr   r   r   HTMLInputStream}   s    

rN   c                   @   sp   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dddZdd Zdd ZdddZdd ZdS )rI   Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    i (  c                 C   sZ   t jsd| _ntddkr$| j| _n| j| _dg| _tddf| _| 	|| _
|   dS )  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        Nu   􏿿r
   r   utf-8certain)r   supports_lone_surrogatesreportCharacterErrorsr&   characterErrorsUCS4characterErrorsUCS2ZnewLineslookupEncodingcharEncoding
openStream
dataStreamreset)r#   rK   r   r   r   r$      s    
zHTMLUnicodeInputStream.__init__c                 C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )r(   	chunkSizechunkOffseterrorsprevNumLinesprevNumCols_bufferedCharacterr5   r   r   r   r[      s    zHTMLUnicodeInputStream.resetc                 C   s   t |dr|}nt|}|S zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r3   )rG   r	   r#   rK   r    r   r   r   rY      s    
z!HTMLUnicodeInputStream.openStreamc                 C   sT   | j }|dd|}| j| }|dd|}|dkr@| j| }n||d  }||fS )N
r   r   r
   )r(   countr`   rfindra   )r#   r,   r(   ZnLinesZpositionLineZlastLinePosZpositionColumnr   r   r   	_position   s    
z HTMLUnicodeInputStream._positionc                 C   s   |  | j\}}|d |fS )z:Returns (line, col) of the current position in the stream.r
   )rh   r^   )r#   linecolr   r   r   r"      s    zHTMLUnicodeInputStream.positionc                 C   s6   | j | jkr|  stS | j }| j| }|d | _ |S )zo Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        r
   )r^   r]   	readChunkr   r(   )r#   r^   charr   r   r   rl      s    

zHTMLUnicodeInputStream.charNc                 C   s   |d kr| j }| | j\| _| _d| _d| _d| _| j|}| j	rX| j	| }d | _	n|s`dS t
|dkrt|d }|dksd|  krdkrn n|d | _	|d d }| jr| | |d	d
}|dd
}|| _t
|| _dS )Nr\   r   Fr
   r         i  z
re   T)_defaultChunkSizerh   r]   r`   ra   r(   r^   rZ   r3   rb   r&   ordrT   replace)r#   r]   r7   Zlastvr   r   r   rk      s0    
 


z HTMLUnicodeInputStream.readChunkc                 C   s(   t tt|D ]}| jd qd S )Ninvalid-codepoint)ranger&   invalid_unicode_refindallr_   r6   )r#   r7   _r   r   r   rU     s    z*HTMLUnicodeInputStream.characterErrorsUCS4c                 C   s   d}t |D ]}|rqt| }| }t|||d  rrt|||d  }|tkrl| j	
d d}q|dkr|dkr|t|d kr| j	
d qd}| j	
d qd S )NF   rs   Trn   i  r
   )ru   finditerrq   groupstartr   ZisSurrogatePairZsurrogatePairToCodepointnon_bmp_invalid_codepointsr_   r6   r&   )r#   r7   skipmatch	codepointr'   Zchar_valr   r   r   rV   #  s"    z*HTMLUnicodeInputStream.characterErrorsUCS2Fc           
      C   s  zt ||f }W nh tk
rx   |D ]}t|dk s$tq$ddd |D }|sZd| }td|  }t ||f< Y nX g }|| j| j	}|dkr| j	| j
krqn0| }|| j
kr|| j| j	|  || _	q|| j| j	d  |  s~qq~d|}	|	S )z Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
           r\   c                 S   s   g | ]}d t | qS )z\x%02x)rq   )r   cr   r   r   r   H  s     z5HTMLUnicodeInputStream.charsUntil.<locals>.<listcomp>z^%sz[%s]+N)charsUntilRegExKeyErrorrq   r+   r9   recompiler~   r(   r^   r]   endr6   rk   )
r#   
charactersZoppositecharsr   regexr:   mr   rr   r   r   
charsUntil:  s0     

z!HTMLUnicodeInputStream.charsUntilc                 C   sT   |t k	rP| jdkr.|| j | _|  jd7  _n"|  jd8  _| j| j |ksPtd S r%   )r   r^   r(   r]   r+   )r#   rl   r   r   r   ungeti  s    
zHTMLUnicodeInputStream.unget)N)F)r;   r<   r=   r>   rp   r$   r[   rY   rh   r"   rl   rk   rU   rV   r   r   r   r   r   r   rI      s    
&
/rI   c                   @   sL   e Zd ZdZdddZdd Zd	d
 ZdddZdd Zdd Z	dd Z
dS )rJ   rO   Nwindows-1252Tc                 C   sn   |  || _t| | j d| _d| _|| _|| _|| _|| _	|| _
| || _| jd dk	sbt|   dS )rP   i   d   r   N)rY   	rawStreamrI   r$   numBytesMetanumBytesChardetoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermineEncodingrX   r+   r[   )r#   rK   r   r   r   r   r   Z
useChardetr   r   r   r$     s    zHTMLBinaryInputStream.__init__c                 C   s&   | j d j| jd| _t|  d S )Nr   rr   )rX   Z
codec_infostreamreaderr   rZ   rI   r[   r5   r   r   r   r[     s    zHTMLBinaryInputStream.resetc                 C   sL   t |dr|}nt|}z||  W n tk
rF   t|}Y nX |S rc   )rG   r   r.   r)   	Exceptionr   rd   r   r   r   rY     s    
z HTMLBinaryInputStream.openStreamc                 C   s  |   df}|d d k	r|S t| jdf}|d d k	r:|S t| jdf}|d d k	rX|S |  df}|d d k	rt|S t| jdf}|d d k	r|d jds|S t| jdf}|d d k	r|S |rpzddl	m
} W n tk
r   Y nX g }| }|js<| j| j}t|tst|s&q<|| || q|  t|jd }| jd |d k	rp|dfS t| jdf}|d d k	r|S tddfS )NrR   r   Z	tentativezutf-16)UniversalDetectorencodingr   )	detectBOMrW   r   r   detectEncodingMetar   name
startswithr   Zchardet.universaldetectorr   ImportErrordoner   r3   r   rB   r2   r+   r6   feedcloseresultr.   r   )r#   chardetrX   r   buffersdetectorr!   r   r   r   r   r     sR    

z'HTMLBinaryInputStream.determineEncodingc                 C   s   | j d dkstt|}|d kr&d S |jdkrFtd}|d k	stnT|| j d krf| j d df| _ n4| jd |df| _ |   td| j d |f d S )Nr
   rR   utf-16beutf-16lerQ   r   zEncoding changed from %s to %s)rX   r+   rW   r   r   r.   r[   r   )r#   ZnewEncodingr   r   r   changeEncoding  s    

z$HTMLBinaryInputStream.changeEncodingc              
   C   s   t jdt jdt jdt jdt jdi}| jd}t|t	s<t
||dd }d}|s~||}d}|s~||dd	 }d	}|r| j| t|S | jd
 dS dS )zAttempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return NonerQ   r   r   zutf-32lezutf-32be   N   rx   r   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   r3   rB   r2   r+   getr.   rW   )r#   ZbomDictstringr   r.   r   r   r   r     s4           
zHTMLBinaryInputStream.detectBOMc                 C   sV   | j | j}t|tstt|}| j d | }|dk	rR|j	dkrRt
d}|S )z9Report the encoding declared by the meta element
        r   Nr   rQ   )r   r3   r   rB   r2   r+   EncodingParserr.   getEncodingr   rW   )r#   r!   parserr   r   r   r   r   3  s    z(HTMLBinaryInputStream.detectEncodingMeta)NNNNr   T)T)r;   r<   r=   r>   r$   r[   rY   r   r   r   r   r   r   r   r   rJ   z  s          
*
>"rJ   c                   @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zeee
Zdd ZeeZefddZdd Zdd Zdd ZdS )EncodingByteszString-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raisedc                 C   s   t |tstt| | S N)rB   r2   r+   __new__lowerr#   valuer   r   r   r   F  s    zEncodingBytes.__new__c                 C   s
   d| _ d S )Nr   )rh   r   r   r   r   r$   J  s    zEncodingBytes.__init__c                 C   s   | S r   r   r5   r   r   r   __iter__N  s    zEncodingBytes.__iter__c                 C   s>   | j d  }| _ |t| kr"tn|dk r.t| ||d  S )Nr
   r   rh   r&   StopIterationrH   r#   pr   r   r   __next__Q  s    zEncodingBytes.__next__c                 C   s   |   S r   )r   r5   r   r   r   nextY  s    zEncodingBytes.nextc                 C   sB   | j }|t| krtn|dk r$t|d  | _ }| ||d  S r%   r   r   r   r   r   previous]  s    zEncodingBytes.previousc                 C   s   | j t| krt|| _ d S r   rh   r&   r   )r#   r"   r   r   r   setPositionf  s    zEncodingBytes.setPositionc                 C   s*   | j t| krt| j dkr"| j S d S d S )Nr   r   r5   r   r   r   getPositionk  s
    
zEncodingBytes.getPositionc                 C   s   | | j | j d  S Nr
   )r"   r5   r   r   r   getCurrentByteu  s    zEncodingBytes.getCurrentBytec                 C   sH   | j }|t| k r>| ||d  }||kr4|| _|S |d7 }q|| _dS )zSkip past a list of charactersr
   Nr"   r&   rh   r#   r   r   r   r   r   r   r}   z  s    
zEncodingBytes.skipc                 C   sH   | j }|t| k r>| ||d  }||kr4|| _|S |d7 }q|| _d S r   r   r   r   r   r   	skipUntil  s    
zEncodingBytes.skipUntilc                 C   s(   |  || j}|r$|  jt|7  _|S )zLook for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone)r   r"   r&   )r#   r2   r:   r   r   r   
matchBytes  s    zEncodingBytes.matchBytesc                 C   s>   z |  || jt| d | _W n tk
r8   tY nX dS )zLook for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the matchr
   T)indexr"   r&   rh   
ValueErrorr   r1   r   r   r   jumpTo  s
     
zEncodingBytes.jumpToN)r;   r<   r=   r>   r   r$   r   r   r   r   r   r   propertyr"   r   currentBytespaceCharactersBytesr}   r   r   r   r   r   r   r   r   B  s    	
	r   c                   @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z?Mini parser for detecting character encoding from meta elementsc                 C   s   t || _d| _dS )z3string - the data to work on for encoding detectionN)r   r7   r   r#   r7   r   r   r   r$     s    
zEncodingParser.__init__c              
   C   s   d| j krd S d| jfd| jfd| jfd| jfd| jfd| jff}| j D ]}d}z| j d W n tk
rz   Y  qY nX |D ]D\}}| j |rz| }W  qW q tk
r   d}Y  qY qX q|sH qqH| j	S )	Ns   <metas   <!--s   </s   <!s   <?r   TF)
r7   handleComment
handleMetahandlePossibleEndTaghandleOtherhandlePossibleStartTagr   r   r   r   )r#   ZmethodDispatchrw   ZkeepParsingkeymethodr   r   r   r     s4    


zEncodingParser.getEncodingc                 C   s   | j dS )zSkip over commentss   -->r7   r   r5   r   r   r   r     s    zEncodingParser.handleCommentc                 C   s   | j jtkrdS d}d }|  }|d kr,dS |d dkr\|d dk}|r|d k	r|| _dS q|d dkr|d }t|}|d k	r|| _dS q|d dkrtt|d }| }|d k	rt|}|d k	r|r|| _dS |}qd S )	NTFr   s
   http-equivr
   s   content-type   charsets   content)	r7   r   r   getAttributer   rW   ContentAttrParserr   parse)r#   Z	hasPragmaZpendingEncodingattrZtentativeEncodingcodecZcontentParserr   r   r   r     s8    zEncodingParser.handleMetac                 C   s
   |  dS )NF)handlePossibleTagr5   r   r   r   r     s    z%EncodingParser.handlePossibleStartTagc                 C   s   t | j | dS )NT)r   r7   r   r5   r   r   r   r     s    
z#EncodingParser.handlePossibleEndTagc                 C   sb   | j }|jtkr(|r$|  |   dS |t}|dkrD|  n|  }|d k	r^|  }qLdS )NTr   )r7   r   asciiLettersBytesr   r   r   spacesAngleBracketsr   )r#   ZendTagr7   r   r   r   r   r   r     s    



z EncodingParser.handlePossibleTagc                 C   s   | j dS )Nr   r   r5   r   r   r   r     s    zEncodingParser.handleOtherc                 C   s  | j }|ttdgB }|dks2t|dks2t|dkr>dS g }g }|dkrV|rVqnX|tkrj| }qnD|dkrd|dfS |tkr||	  n|dkrdS || t
|}qF|dkr|  d|dfS t
| | }|dkrJ|}t
|}||kr"t
| d|d|fS |tkr<||	  q|| qnJ|d	krbd|dfS |tkr|||	  n|dkrdS || t
|}|tkrd|d|fS |tkr||	  n|dkrdS || qdS )
z_Return a name,value pair for the next attribute in the stream,
        if one is found, or None   /Nr
   )r   N   =)r   r   r8   )   '   "r   )r7   r}   r   	frozensetr&   r+   r9   asciiUppercaseBytesr6   r   r   r   r   )r#   r7   r   attrName	attrValue	quoteCharr   r   r   r     sb    











zEncodingParser.getAttributeN)r;   r<   r=   r>   r$   r   r   r   r   r   r   r   r   r   r   r   r   r     s   $r   c                   @   s   e Zd Zdd Zdd ZdS )r   c                 C   s   t |tst|| _d S r   )rB   r2   r+   r7   r   r   r   r   r$   a  s    zContentAttrParser.__init__c                 C   s   z| j d | j  jd7  _| j   | j jdks<W d S | j  jd7  _| j   | j jdkr| j j}| j  jd7  _| j j}| j |r| j || j j W S W d S nR| j j}z"| j t | j || j j W W S  tk
 r   | j |d   Y W S X W n tk
r   Y d S X d S )Nr   r
   r   )r   r   )r7   r   r"   r}   r   r   r   r   )r#   Z	quoteMarkZoldPositionr   r   r   r   e  s.    

zContentAttrParser.parseN)r;   r<   r=   r$   r   r   r   r   r   r   `  s   r   c                 C   sf   t | tr0z| d} W n tk
r.   Y dS X | dk	r^zt| W S  tk
rZ   Y dS X ndS dS )z{Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.r   N)rB   r2   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)r   r   r   r   rW     s    

rW   )0
__future__r   r   r   sixr   Z	six.movesr   r   r   r   ior   r	   r   	constantsr   r   r   r   r   r\   r   r   r   r   r   r   Zinvalid_unicode_no_surrogaterS   rf   r+   r   evalru   r|   Zascii_punctuation_rer   objectr   rN   rI   rJ   r2   r   r   r   rW   r   r   r   r   <module>   s   

                     
J g Ib ='