ó
ÖÖíVc           @   s^  d  d l  m Z d  d l m Z d  d l Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 Z
 d  d l Z d  d l Z d  d l Z d  d l m Z d  d l m Z m Z d  d l Z d  d l m Z m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Td  d l Z d  d l Z d „  Z g  d	 „ Z  d
 „  Z! e" d k rZe ƒ  n  d S(   iÿÿÿÿ(   t   Counter(   t   KMeansN(   t   WordNetLemmatizer(   t   TextBlobt   Word(   t   TfidfTransformert   CountVectorizer(   t   *c       
   C   sS  d }  t  j d |  d t  j ƒ t  j ƒ  } | j t  j ƒ t  j d d ƒ } | j d j | ƒ t j	 d d ƒ } | j
 d	 d
 d d d d d d t ƒt j ƒ  } | j ƒ  j } | j d ƒ d d } t | d ƒ j ƒ  } d j | ƒ g } | j d d ƒ | j d | ƒ | j d d ƒ t | | | ƒ | j d t j ƒ  | ƒ d  S(   Ns   log_vec2topic.logt   filenamet   levels   %(asctime)s %(message)ss   %b-%d-%Y %H:%M:%Si    t   descriptions   Run Vec2Topic on a text files   -it   actiont   storet   destt   inpt   helps   text file inputedt   requiredt   .s   .csvt   rbt    R   i2   s   Running Vec2Topic on %ss   Total time: %.2f seconds(   t   loggingt   basicConfigt   INFOt	   getLoggert   setLevelt	   Formattert   handlerst   setFormattert   argparset   ArgumentParsert   add_argumentt   Truet   timet
   parse_argsR   t   splitt   opent	   readlinest   joint   infot	   vec2topic(   t   LOG_FILENAMEt   loggert	   formattert   parsert   time1t   filert   outfilet   text(    (    s   vec2topic.pyt   main   s$    %c         C   së  t  j d ƒ } y t |  d d ƒ}  Wn | j d ƒ n X| j d |  ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d	 d
 | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t j | ƒ } g  | D]- } t | ƒ d k r´d j | j	 ƒ  ƒ ^ q´} | S(   Ns   [\w\.-]+@[\w\.-]+t   errorst   ignores   unicode errort    s   [^ -]+R   s   \rs   \ns   . s   \ts   (?:\@|https?\://)\S+s   \d*.\d+s   [.]{2,}R   s   \d+.\d+s   [~/<>()_=-]s   's   "s   [,:\*!#%/$+\^]s   [`\[\]\{\}\|]s   \\s   \b\d+\bs    +i   (
   t   ret   compilet   unicodeR&   t   subt   nltkt   sent_tokenizet   lenR%   R"   (   t   inputStringR)   t	   namesListt   email_patternt   outputStringt   outputSentencest   w(    (    s   vec2topic.pyt   cleanupContent/   s2    :c   U         sv  g  |  D] } t  j d d | ƒ ^ q }  t ƒ  } g  |  D]. } g  | j ƒ  D] } | j | ƒ ^ qK ^ q8 } g  | D]% } g  | D] } | j ƒ  ^ q€ ^ qs } | j d ƒ t j d ƒ }	 t	 j
 t d d ƒ ƒ \ }
 } | j d ƒ t j j j | ƒ } t | | ƒ } t t g  | D]% } | D] } d | k r!| ^ q!qƒ ƒ } g  | D] } t  j d d	 | ƒ ^ qO} g  | D] } | D] } | ^ qqw} t | ƒ } g  t t | ƒ ƒ D]$ } | | | d
 k rµ| | ^ qµ} t | ƒ j | ƒ } g  } xê | D]â } g  } xÆ | D]¾ } d | k r6| j | ƒ q| | k ra| j t  j d d	 | ƒ ƒ q| | k r}| j | ƒ qt  j d d | ƒ } | j d ƒ d
 } | j d ƒ d } | j | ƒ | j | ƒ qW| j | ƒ qW| } | j d ƒ g  } xl | D]d } g  } t d j | ƒ ƒ } x3 | j D]( \ } } | d= k r2| j | ƒ q2q2W| j | ƒ qW| j d ƒ d }  d }! t j j j | d d d |! d d d d ƒ}" | j d ƒ | j d ƒ g  | D] }# |# D] } | ^ qàqÖ} t | ƒ }$ t |
 ƒ j |" j ƒ j |$ ƒ }% i  }& g  }' x` |% D]X } t | ƒ d k r5t j t j |	 t  | ƒ |" | f ƒ ƒ |& | <|' j | ƒ q5q5W| }( g  | D] }# |# D] } | ^ q¨qž} t | ƒ }$ | j d ƒ t! |& |' d> | d t" d t# d d
 d  d
 ƒ\ }) ‰
 t$ j% |) d! d" d# d$ ƒ}* t |) ƒ }+ t& |* ˆ
 |+ ƒ }, | j d% ƒ g  | D] } d j | ƒ ^ qV}- t' d& d' ƒ }. |. j( |- ƒ }/ i  ‰ x' |. j) j* ƒ  D] \ }0 }1 |0 ˆ |1 <q¥Wg  }2 x9 |. j) j+ ƒ  D]( \ }0 }1 |0 |' k rÕ|2 j |1 ƒ qÕqÕWt, |2 ƒ ‰	 |/ d  d  … ˆ	 f }3 t- ƒ  j( |3 ƒ }4 |3 j. |3 }5 | j d( ƒ |5 j/ \ }6 }7 |5 j0 ƒ  }8 g  t t ˆ	 ƒ ƒ D] } ˆ ˆ	 | ^ q}}9 g  }: g  }; g  }< x† t1 j2 |8 j3 |8 j4 |8 j5 ƒ D]f \ }= }> }? |? d
 k rÈ|= |> k rÈd }1 |1 d
 k r.|; j |= ƒ |< j |> ƒ |: j |1 ƒ q.qÈqÈWt6 j7 j8 |: |; |< f f d) |6 |7 f ƒ}@ |@ j9 ƒ  }A ‡ ‡	 f d* †  t t ˆ	 ƒ ƒ Dƒ }B d+ „  |B j* ƒ  Dƒ }C | j d, ƒ |A j: d- d ƒ }D i  ‰ x6 t t ˆ	 ƒ ƒ D]" }E t; |D |E ƒ ˆ ˆ ˆ	 |E <qÛWt< ˆ j= ƒ  ƒ ‰ t< |, j= ƒ  ƒ ‰ ‡ ‡ f d. †  ˆ j> ƒ  Dƒ }F d ‰  ‡  f d/ †  |F j+ ƒ  Dƒ ‰ ‡ f d0 †  |, j+ ƒ  Dƒ } d ‰  ‡  f d1 †  | j+ ƒ  Dƒ ‰ ‡ ‡ f d2 †  ˆ j> ƒ  Dƒ } t j< | j= ƒ  ƒ ‰ ‡ f d3 †  | j+ ƒ  Dƒ }G | j d4 ƒ d5 }H t? d6 |H ƒ ‰ ˆ j@ g  |) D] } | ^ q#	ƒ ‡ ‡
 f d7 †  t t ˆ
 ƒ ƒ Dƒ }I i  }J g  t |H ƒ D] }= g  ^ qn	}K g  t |H ƒ D] }= g  ^ q	}L x– t |H ƒ D]ˆ }= g  t t ˆ
 ƒ ƒ D]# }E ˆ jA |E |= k rÅ	ˆ
 |E ^ qÅ	|K |= <g  |K |= D] } |G | ^ qý	}M t jB t, |M d8 t# ƒƒ |L |= <q¬	Wt jC |L ƒ }N xC tD t |H ƒ ƒ D]/ \ }O }= x  |K |N |= D] } |= |J | <qu
WqZ
W| j d9 | ƒ g  }P x” t d5 ƒ D]† }= |K |N |= }Q g  t, g  |Q D] } | |G | g ^ qÒ
d: tE d ƒ d8 t# ƒD] } | d
 ^ q}R |P j d; |= d g |R d  ƒ q±
WtF |P Œ  } t | d< ƒ }S tG jH |S ƒ }T |T jI | ƒ t# S(?   Ns   [^\w]R   s   Reading wiki vecss   /data/wikimodel/wiki.shelves   /data/wikimodel/wiki.pklR   s   Running Bigramst   _R3   i    i   s   Extracting Nounst   NNt   NNPt   NNSt   NNPSs   Word2Vec training starting...i,  i   t   workersi   t   sizet   iteri2   t	   min_counti   s   Word2Vec training complete...s   Creating word vecss   Clustering for depth...t   repeatt
   normalizedt   l2_thresholdt   methodt   averaget   metrict   cosines   Computing co-occurence grapht   token_patternu   (?u)\b([^\s]+)s   Computing degreet   shapec            s!   i  |  ] } ˆ  ˆ | | “ q S(    (    (   t   .0R@   (   t   id2wordt   sort_ids(    s   vec2topic.pys
   <dictcomp>æ   s   	 c         S   s   i  |  ] \ } } | | “ q S(    (    (   RT   t   keyt   value(    (    s   vec2topic.pys
   <dictcomp>ç   s   	 s   Computing metricst   axisc            s;   i  |  ]1 } t  j d  ˆ  | ƒ t  j d  ˆ ƒ | “ q S(   i   (   t   npt   log(   RT   R@   (   t   degt   max_deg(    s   vec2topic.pys
   <dictcomp>ó   s   	 c            s#   i  |  ] \ } } | ˆ  | “ q S(    (    (   RT   RW   RX   (   t   alpha(    s   vec2topic.pys
   <dictcomp>õ   s   	 c            s'   i  |  ] \ } } | d  ˆ  | “ q S(   g      ð?(    (   RT   RW   RX   (   t	   max_depth(    s   vec2topic.pys
   <dictcomp>÷   s   	 c            s#   i  |  ] \ } } | ˆ  | “ q S(    (    (   RT   RW   RX   (   R^   (    s   vec2topic.pys
   <dictcomp>ù   s   	 c            s%   i  |  ] } ˆ  | ˆ | | “ q S(    (    (   RT   RW   (   t   deg_modt	   depth_mod(    s   vec2topic.pys
   <dictcomp>û   s   	 c            s'   i  |  ] \ } } | d  ˆ  | “ q S(   g      ð?(    (   RT   RW   RX   (   t
   max_metric(    s   vec2topic.pys
   <dictcomp>ý   s   	 s   Running K-meansi
   t
   n_clustersc            s$   i  |  ] } ˆ  j  | ˆ | “ q S(    (   t   labels_(   RT   t   x(   t   kmeanst   word_d2v(    s   vec2topic.pys
   <dictcomp>  s   	 t   reverses   Done...Writing output to %sRW   s   Topic %dt   wb(   RC   RD   RE   RF   iE  (J   R4   R7   R   R"   t	   lemmatizet   lowerR&   t   shelveR#   t   cPicklet   loadt   gensimt   modelst   phrasest   Phrasest   listt   setR    t   xrangeR:   t   intersectiont   appendt	   partitionR   R%   t   tagst   word2vect   Word2Vect   vocabRZ   t   arrayt   concatenatet   strt   create_word_listt   FalseR   t   fastclustert   linkaget   calculate_depthR   t   fit_transformt   vocabulary_t   itemst	   iteritemst   sortedR   t   TRS   t   tocoot	   itertoolst   izipt   rowt   colt   datat   spt   sparset
   coo_matrixt   tocsct   sumt   intt   maxt   valuest   iterkeysR   t   fitRd   t   meant   argsortt	   enumeratet
   itemgettert   zipt   csvt   writert	   writerows(U   t
   inpContentR)   R.   t   tempt   wordnet_lemmatizert   XR@   t   sentences_lemRB   t   model_wiki_vect   model_wiki_vocab_lowercaset   wiki_bigram_wordt   bigramt   sentences_bigramst   bigram_listt   bigram_joinedt   wordst   bigram_freqt   to_joint   wiki_bigram_word_commont   sentences_bigrammed_tempt   sentt   new_sentt   new_wt   new_w1t   new_w2t   sentences_bigrammedt   sentences_nounst   nounst   blobt   wordt   tagt   dim_wikit   dim_datat   model_wR/   t   Vocabt   common_vocabt
   model_combt   model_comb_vocabt	   sentencest   data_d2vt	   spclustert
   num_pointst   depthRŠ   t   cvt
   bow_matrixRW   RX   t   idst   bow_reducedRL   t   similarity_graph_reducedt   mt   nt   cxt   keyzR   t   rot   cot   it   jt   vt   SSt   SP_fullt   id_wordt   word_idt   degsumRe   t   temp_deg_modRP   t   Kt   kmeans_labelt   kmeans_label_rankedt   topict   clust_deptht
   temp_scoret   indext   numt   listert   topt   sort_topt   bt   a(    (   R^   R\   R`   Ra   RU   Rf   R]   R_   Rb   RV   Rg   s   vec2topic.pyR'   K   s   (	;2>(&=-&!0&3"-.*%  %@!%R$t   __main__(#   t   collectionsR    t   sklearn.clusterR   t   numpyRZ   t   scipyR‘   RŒ   t   jsonR   Ro   R4   R8   t	   nltk.stemR   t   textblobR   R   Rm   t   sklearn.feature_extraction.textR   R   R   R    t   sklearnR‚   t   networkxt   utilsRl   R    R0   RA   R'   t   __name__(    (    (    s   vec2topic.pyt   <module>   s.   
		Ú