B
    1dX                 @   s  d Z ddlZddlZddlZejddedd ddlmZmZ ddlm	Z	 dd	l
mZmZ dd
l
mZ ddlmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZm Z  ddl!m"Z" e	j#Z$e"e$Z%e	j&d e	j&d e	j&d hZ'e	j&d Z(e	j&d Z)e	j&d Z*e	j&d Z+e,dZ-e,dZ.G dd dZ/G dd de Z0G dd deZ1dd Z2d d! Z3d"d# Z4e5d$Z6d%d& Z7G d'd( d(eZ8dS ))z
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
    Nignorez"html5lib's sanitizer is deprecatedzbleach._vendor.html5lib)messagecategorymodule)
HTMLParsergetTreeWalker)	constants)
namespacesprefixes)_ReparseException)Filter)allowed_protocolsallowed_css_propertiesallowed_svg_propertiesattr_val_is_urisvg_attr_val_allows_refsvg_allow_local_href)HTMLInputStream)escapeHTMLSerializer)attributeMapHTMLTokenizer)TrieZStartTagZEndTagZEmptyTagZ
CharactersZ
ParseError)paZabbraddressZareaarticleasideZaudiobbaseZbdiZbdo
blockquotebodybrZbuttonZcanvascaptionZcitecodecolZcolgroupdataZdatalistdddeldetailsdfndialogdivdldtZemZembedfieldset
figcaptionfigurefooterformh1h2h3h4h5h6headheaderhgrouphrhtmliZiframeZimginputZinsZkbdZkeygenlabelZlegendlilinkmapmarkZmenumetaZmeternavZnoscriptobjectolZoptgroupoptionoutputpparamZpicturepreprogressqrprtrubysZsampscriptsectionselectZslotZsmallsourcespanstrongstylesubsummarysuptableZtbodytdtemplateZtextareaZtfootthZtheadtimetitletrtrackuulvarZvideoZwbr)!r   r   r   r   r(   r*   r&   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r:   r;   r<   rA   mainrF   rH   rK   rM   rU   r^   rg   c               @   sf   e Zd ZdZdd Zedd Zedd Zedd	 Zd
d Z	dddZ
dd Zdd Zdd ZdS )InputStreamWithMemoryzWraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    c             C   s$   || _ | j j| _| j j| _g | _d S )N)_inner_streamresetposition_buffer)selfZinner_stream rp   a/work/yifan.wang/ringdown/master-ringdown-env/lib/python3.7/site-packages/bleach/html5lib_shim.py__init__   s    

zInputStreamWithMemory.__init__c             C   s   | j jS )N)rk   errors)ro   rp   rp   rq   rs     s    zInputStreamWithMemory.errorsc             C   s   | j jS )N)rk   charEncoding)ro   rp   rp   rq   rt     s    z"InputStreamWithMemory.charEncodingc             C   s   | j jS )N)rk   changeEncoding)ro   rp   rp   rq   ru   
  s    z$InputStreamWithMemory.changeEncodingc             C   s   | j  }|r| j| |S )N)rk   charrn   append)ro   crp   rp   rq   rv     s    
zInputStreamWithMemory.charFc             C   s$   | j j||d}| jt| |S )N)opposite)rk   
charsUntilrn   extendlist)ro   
charactersry   charsrp   rp   rq   rz     s    z InputStreamWithMemory.charsUntilc             C   s   | j r| j d | j|S )N)rn   poprk   unget)ro   rv   rp   rp   rq   r     s    zInputStreamWithMemory.ungetc             C   s   d | jS )zReturns the stream history since last '<'

        Since the buffer starts at the last '<' as as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

         )joinrn   )ro   rp   rp   rq   get_tag  s    zInputStreamWithMemory.get_tagc             C   s   dg| _ dS )zResets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        <N)rn   )ro   rp   rp   rq   	start_tag)  s    zInputStreamWithMemory.start_tagN)F)__name__
__module____qualname____doc__rr   propertyrs   rt   ru   rv   rz   r   r   r   rp   rp   rp   rq   rj      s   

rj   c                   sT   e Zd ZdZd fdd	Z fddZd fdd		Z fd
dZ fddZ  Z	S )BleachHTMLTokenizerz1Tokenizer that doesn't consume character entitiesFc                s*   t  jf | || _t| j| _d | _d S )N)superrr   consume_entitiesrj   streamemitted_last_token)ro   r   kwargs)	__class__rp   rq   rr   6  s    zBleachHTMLTokenizer.__init__c             #   s6  d }xt   D ]}|d k	r|d dkrh|d tkrh|drhtdd |d  D |d< d }|V  q|d dkr| jjd k	r|d  	 | jjkr| j
 |d< t|d< d }|V  q|d tkr|V  |}q|V  |V  d }q|d tkr|}q|V  qW |r2|d dkr,td| jd	  d
V  n|V  d S )Nr%   z#invalid-character-in-attribute-nametypec             s   s2   | ]*\}}d |krd|krd|kr||fV  qdS )"'r   Nrp   ).0	attr_nameZ
attr_valuerp   rp   rq   	<genexpr>R  s   z/BleachHTMLTokenizer.__iter__.<locals>.<genexpr>z!expected-closing-tag-but-got-charzeof-in-tag-namer   name)r   r%   )r   __iter__TAG_TOKEN_TYPESgetr   itemsparsertagslowerstripr   r   TAG_TOKEN_TYPE_CHARACTERSTAG_TOKEN_TYPE_PARSEERRORcurrentToken)ro   Zlast_error_tokentoken)r   rp   rq   r   A  sD    

zBleachHTMLTokenizer.__iter__Nc                sJ   | j rt ||S |r4| jd d d  d7  < n| jtdd d S )Nr%   r      &)r   r%   )r   r   consumeEntityr   
tokenQueuerw   r   )ro   ZallowedCharZfromAttribute)r   rp   rq   r     s
    z!BleachHTMLTokenizer.consumeEntityc                s   | j   t  S )N)r   r   r   tagOpenState)ro   )r   rp   rq   r     s    
z BleachHTMLTokenizer.tagOpenStatec                s   | j }| jjd k	r|d tkr|d  | jjkr| jjrh| jrb|d tkrb|d  tkrbd}qrd}n
| j	
 }t|d}| | _ | _| j| | j| _d S | j | _t   d S )Nr   r   
r   )r   r%   )r   r   r   r   r   r   r   TAG_TOKEN_TYPE_STARTHTML_TAGS_BLOCK_LEVELr   r   r   r   rw   Z	dataStatestater   emitCurrentToken)ro   r   Znew_dataZ	new_token)r   rp   rq   r     s$    

z$BleachHTMLTokenizer.emitCurrentToken)F)NF)
r   r   r   r   rr   r   r   r   r   __classcell__rp   rp   )r   rq   r   3  s   Tr   c                   s*   e Zd ZdZ fddZd	ddZ  ZS )
BleachHTMLParserz$Parser that uses BleachHTMLTokenizerc                s>   |dk	rt dd |D nd| _|| _|| _t jf | dS )a  
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        Nc             s   s   | ]}|  V  qd S )N)r   )r   tagrp   rp   rq   r     s    z,BleachHTMLParser.__init__.<locals>.<genexpr>)	frozensetr   r   r   r   rr   )ro   r   r   r   r   )r   rp   rq   rr     s     zBleachHTMLParser.__init__Fr+   Tc             K   sj   || _ || _|| _tf || j| d|| _|   y|   W n$ tk
rd   |   |   Y nX d S )N)r   r   r   )	ZinnerHTMLMode	container	scriptingr   r   Z	tokenizerrl   ZmainLoopReparseException)ro   r   Z	innerHTMLr   r   r   rp   rp   rq   _parse  s    	zBleachHTMLParser._parse)Fr+   T)r   r   r   r   rr   r   r   rp   rp   )r   rq   r     s   r   c             C   s   | d dkrt | dk rdS | d dkr<| dd d }}n| dd d }}|d	krZdS t||}d|  k rxd
k rn nt|S dS t| dS )a9  Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    r   #   Nr   )xX   
   r   i   )lenintchrENTITIESr   )valueZint_as_stringr   
code_pointrp   rp   rq   convert_entity  s    
r   c             C   s   d| kr| S g }xxt | D ]l}|s$q|dr|t|}|dk	r|t|}|dk	r||| |t|d d }|r|| q|| qW d|S )zConverts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    r   Nr   r   )next_possible_entity
startswithmatch_entityr   rw   r   r   )textnew_textpartentityZ	converted	remainderrp   rp   rq   convert_entities*  s$    


r   c             C   s"  | d dkrt d| dd } t| } d}dtj }| r| d dkrd}| d | rx| d d	krxd
}|| d7 }nd}x0| r| d |kr| d}||krP ||7 }q~W |r| r| d dkr|S dS x4| r| d |kr| d}||7 }t|sdS qW |r| r| d dkr|S dS )av  Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    r   r   zStream should begin with "&"r   Nr   z<&=;r   )r   r   Z0123456789abcdefABCDEF
0123456789;)
ValueErrorr|   string
whitespacer   ENTITIES_TRIEZhas_keys_with_prefix)r   Zpossible_entityZend_charactersallowedrx   rp   rp   rq   r   M  s:    




r   z(&)c             c   sF   x@t t| D ].\}}|dkr(|V  q|d dkrd| V  qW dS )zTakes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    r   r   r   N)	enumerateAMP_SPLIT_REsplit)r   r>   r   rp   rp   rq   r     s
    	r   c                   s.   e Zd ZdZdZdd Zd fdd	Z  ZS )	BleachHTMLSerializerz[HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    Tc             c   s   | dd}xxt|D ]l}|s q|drtt|}|dk	rtt|dk	rtd| dV  |t|d d }|r|V  q| ddV  qW dS )z,Escapes just bare & in HTML attribute valuesz&amp;r   Nr   r   )replacer   r   r   r   r   )ro   stokenr   r   rp   rp   rq   escape_base_amp  s    
z$BleachHTMLSerializer.escape_base_ampNc             #   s   d}d}xtt  ||D ]b}|rf|dkr.d}n0|rR|dkr^| |E dH  d}qn|dkr^d}|V  q|drtd}|V  qW dS )zWrap HTMLSerializer.serialize and conver & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        F>r   N=Tr   )r   	serializer   r   )ro   Z
treewalkerencodingin_tagZafter_equalsr   )r   rp   rq   r     s"    
zBleachHTMLSerializer.serialize)N)r   r   r   r   Zescape_rcdatar   r   r   rp   rp   )r   rq   r     s   r   )9r   rer   warningsfilterwarningsDeprecationWarningZbleach._vendor.html5libr   r   r   Z!bleach._vendor.html5lib.constantsr	   r
   r   r   Z$bleach._vendor.html5lib.filters.baser   Z)bleach._vendor.html5lib.filters.sanitizerr   r   r   r   r   r   ZSanitizerFilterZ$bleach._vendor.html5lib._inputstreamr   Z"bleach._vendor.html5lib.serializerr   r   Z"bleach._vendor.html5lib._tokenizerr   r   Zbleach._vendor.html5lib._trier   entitiesr   r   Z
tokenTypesr   r   ZTAG_TOKEN_TYPE_ENDr   r   r   Z	HTML_TAGSr   rj   r   r   r   r   r   compiler   r   r   rp   rp   rp   rq   <module>   sT    



w%? *,"#<
