From b31973d514d12a7b7541049215440810de774741 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 16 Aug 2006 11:54:54 -0400 Subject: [PATCH] Initial load --- examples/images/feed-icon-10x10.png | Bin 0 -> 469 bytes examples/images/logo.png | Bin 0 -> 5413 bytes examples/planet.css | 150 + examples/planet.xslt | 65 + planet/BeautifulSoup.py | 1824 ++++++++ planet/__init__.py | 45 + planet/compat_logging/__init__.py | 1196 ++++++ planet/compat_logging/config.py | 299 ++ planet/compat_logging/handlers.py | 728 ++++ planet/config.py | 112 + planet/feedparser.py | 3656 +++++++++++++++++ planet/reconstitute.py | 195 + planet/spider.py | 86 + planet/splice.py | 46 + planet/timeoutsocket.py | 424 ++ runtests.py | 11 + spider.py | 20 + splice.py | 21 + tests/__init__.py | 0 tests/data/config/basic.ini | 13 + tests/data/reconstitute/author_email.xml | 13 + tests/data/reconstitute/author_name.xml | 13 + tests/data/reconstitute/author_uri.xml | 13 + tests/data/reconstitute/content_html.xml | 10 + .../reconstitute/content_illegal_char.xml | 10 + tests/data/reconstitute/content_lang.xml | 10 + tests/data/reconstitute/content_tag_soup.xml | 10 + tests/data/reconstitute/content_text.xml | 10 + tests/data/reconstitute/content_xhtml.xml | 13 + tests/data/reconstitute/contributor_email.xml | 13 + tests/data/reconstitute/contributor_name.xml | 13 + tests/data/reconstitute/contributor_uri.xml | 13 + tests/data/reconstitute/id.xml | 11 + tests/data/reconstitute/id_only_content.xml | 13 + .../data/reconstitute/id_only_description.xml | 13 + tests/data/reconstitute/id_only_link.xml | 13 + tests/data/reconstitute/id_only_title.xml | 13 + tests/data/reconstitute/link_href.xml | 11 + tests/data/reconstitute/link_rel.xml | 11 + tests/data/reconstitute/link_type.xml | 11 + tests/data/reconstitute/published.xml | 11 + tests/data/reconstitute/rights.xml | 11 + tests/data/reconstitute/source_author.xml | 12 + .../data/reconstitute/source_contributor.xml | 12 + 
tests/data/reconstitute/source_icon.xml | 10 + tests/data/reconstitute/source_id.xml | 10 + tests/data/reconstitute/source_link.xml | 10 + tests/data/reconstitute/source_logo.xml | 10 + tests/data/reconstitute/source_rights.xml | 10 + tests/data/reconstitute/source_subtitle.xml | 10 + tests/data/reconstitute/source_title.xml | 10 + tests/data/reconstitute/source_updated.xml | 10 + tests/data/reconstitute/summary_html.xml | 10 + tests/data/reconstitute/summary_lang.xml | 10 + tests/data/reconstitute/summary_text.xml | 10 + tests/data/reconstitute/summary_xhtml.xml | 13 + tests/data/reconstitute/title_html.xml | 10 + tests/data/reconstitute/title_lang.xml | 10 + tests/data/reconstitute/title_text.xml | 10 + tests/data/reconstitute/title_xhtml.xml | 13 + tests/data/reconstitute/updated.xml | 11 + tests/data/spider/config.ini | 12 + tests/data/spider/testfeed1a.atom | 49 + tests/data/spider/testfeed1b.atom | 50 + tests/data/spider/testfeed2.atom | 49 + tests/data/spider/testfeed3.rss | 37 + ...g:planet.intertwingly.net,2006:testfeed1,1 | 22 + ...g:planet.intertwingly.net,2006:testfeed1,2 | 23 + ...g:planet.intertwingly.net,2006:testfeed1,3 | 22 + ...g:planet.intertwingly.net,2006:testfeed1,4 | 22 + ...g:planet.intertwingly.net,2006:testfeed2,1 | 22 + ...g:planet.intertwingly.net,2006:testfeed2,2 | 22 + ...g:planet.intertwingly.net,2006:testfeed2,3 | 22 + ...g:planet.intertwingly.net,2006:testfeed2,4 | 22 + tests/data/splice/config.ini | 11 + tests/test_config.py | 52 + tests/test_reconstitute.py | 37 + tests/test_spider.py | 65 + tests/test_splice.py | 17 + 79 files changed, 9907 insertions(+) create mode 100644 examples/images/feed-icon-10x10.png create mode 100644 examples/images/logo.png create mode 100644 examples/planet.css create mode 100644 examples/planet.xslt create mode 100644 planet/BeautifulSoup.py create mode 100644 planet/__init__.py create mode 100644 planet/compat_logging/__init__.py create mode 100644 planet/compat_logging/config.py create mode 100644 
planet/compat_logging/handlers.py create mode 100644 planet/config.py create mode 100755 planet/feedparser.py create mode 100644 planet/reconstitute.py create mode 100644 planet/spider.py create mode 100644 planet/splice.py create mode 100644 planet/timeoutsocket.py create mode 100755 runtests.py create mode 100644 spider.py create mode 100644 splice.py create mode 100644 tests/__init__.py create mode 100644 tests/data/config/basic.ini create mode 100644 tests/data/reconstitute/author_email.xml create mode 100644 tests/data/reconstitute/author_name.xml create mode 100644 tests/data/reconstitute/author_uri.xml create mode 100644 tests/data/reconstitute/content_html.xml create mode 100644 tests/data/reconstitute/content_illegal_char.xml create mode 100644 tests/data/reconstitute/content_lang.xml create mode 100644 tests/data/reconstitute/content_tag_soup.xml create mode 100644 tests/data/reconstitute/content_text.xml create mode 100644 tests/data/reconstitute/content_xhtml.xml create mode 100644 tests/data/reconstitute/contributor_email.xml create mode 100644 tests/data/reconstitute/contributor_name.xml create mode 100644 tests/data/reconstitute/contributor_uri.xml create mode 100644 tests/data/reconstitute/id.xml create mode 100644 tests/data/reconstitute/id_only_content.xml create mode 100644 tests/data/reconstitute/id_only_description.xml create mode 100644 tests/data/reconstitute/id_only_link.xml create mode 100644 tests/data/reconstitute/id_only_title.xml create mode 100644 tests/data/reconstitute/link_href.xml create mode 100644 tests/data/reconstitute/link_rel.xml create mode 100644 tests/data/reconstitute/link_type.xml create mode 100644 tests/data/reconstitute/published.xml create mode 100644 tests/data/reconstitute/rights.xml create mode 100644 tests/data/reconstitute/source_author.xml create mode 100644 tests/data/reconstitute/source_contributor.xml create mode 100644 tests/data/reconstitute/source_icon.xml create mode 100644 
tests/data/reconstitute/source_id.xml create mode 100644 tests/data/reconstitute/source_link.xml create mode 100644 tests/data/reconstitute/source_logo.xml create mode 100644 tests/data/reconstitute/source_rights.xml create mode 100644 tests/data/reconstitute/source_subtitle.xml create mode 100644 tests/data/reconstitute/source_title.xml create mode 100644 tests/data/reconstitute/source_updated.xml create mode 100644 tests/data/reconstitute/summary_html.xml create mode 100644 tests/data/reconstitute/summary_lang.xml create mode 100644 tests/data/reconstitute/summary_text.xml create mode 100644 tests/data/reconstitute/summary_xhtml.xml create mode 100644 tests/data/reconstitute/title_html.xml create mode 100644 tests/data/reconstitute/title_lang.xml create mode 100644 tests/data/reconstitute/title_text.xml create mode 100644 tests/data/reconstitute/title_xhtml.xml create mode 100644 tests/data/reconstitute/updated.xml create mode 100644 tests/data/spider/config.ini create mode 100644 tests/data/spider/testfeed1a.atom create mode 100644 tests/data/spider/testfeed1b.atom create mode 100644 tests/data/spider/testfeed2.atom create mode 100644 tests/data/spider/testfeed3.rss create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3 create mode 100644 tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4 create mode 100644 tests/data/splice/config.ini create mode 100644 tests/test_config.py create mode 100644 
tests/test_reconstitute.py create mode 100644 tests/test_spider.py create mode 100644 tests/test_splice.py diff --git a/examples/images/feed-icon-10x10.png b/examples/images/feed-icon-10x10.png new file mode 100644 index 0000000000000000000000000000000000000000..cc869bc61785f4db646fcbbcfc87aa3d20d99eba GIT binary patch literal 469 zcmV;`0V@89P)b#`aw-kX_Si^Jc1|2c;v&L+N%#GTkbr^vx_L@0?2ue1vae8uy9 zW>j2Fwi~;wnv#w|%)>D{wT>10d6+ znvjX&#K$e}$~4lwrKocpr#p$RZpK^viI-7mZAGBOZfK!ocn0%!gWCL!dO5}Fox+@M z<8LitMCck7=WdtW+z{sJ1iSwiD)WlBvkT!CKn3HAn`5Jatl8=pf zWMv()t_|gz0w^mJ$i`l18o*uui>ycxWHsvP8s|%U9u%*Cx=p;;psT*)T^`{*rxCTS zWX}(wG=ZN^W3ms(Xv}CQKedl?b7Aoq{>2_>AN82ZL&^z8{|hhxfn}MuvYci literal 0 HcmV?d00001 diff --git a/examples/images/logo.png b/examples/images/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f277bf9281a3659dd27aaaf6822e01240cc48cd0 GIT binary patch literal 5413 zcmV+=724{FP)V=f?8F)lDMe$ayr000z}Nkl4mzf#6qb_jOdU?V18fcGzk>>eZ2R9^YnVdnzq$qU*J^?$>=2h4VA2BmZ$zH`4h(V`BF0Y> z)dx)(x(cWoK>oMwet;ONf+P;^YtL~jirl~k0zXiJzCNfgoGZL79e}2*=>L#2MMTwY zm@GXG$yol3lj-YIQrQg;r>-DaCxD_nQ#ORH?f8KSua##Feg>%f`kWOT)_)xPb3rj* zL9*jB1PO>Jx!TBjg=y>=qGSG0nVT zXcI3A<#?z=jhrw?4~- z4u4Sdd0{r;@o8NNfUDTlyBvlxkoy^gELiU15J^B#^>PQuH@Jx6e7)OdQBMt&N&}cb zC_ae+E(LH+kD#t&*nY;+Br56iL~y@bFzj1Cm<-JQJpB8HxuD!#QA92Wo4BTwh!)@( z^JkQ2(lmSykHsh{LbVC}QA^dWUp&Y@% zEoU<89OUvLWPmX{6Cf}PkP3^l>(5f&yH@^kjiH?f07-S{(+Xs#FmRrTf80fr?j!CH z7Lpz>LFYT6I>fY@ev8JijpLm4x>NQ)Dk#05{0P6<4=fh^S|CFg%s>>;1lUla{YH(U zow0FoP|0amstmxy?sMwhaO5M`2|K{hkr?DdEy9zr83sS}`!oj1R_3E;1}@d_fB@RQ z^#{03CqX(hF!#Q{W|&Qytl^xS`lX*znaAr@MSMna6Bmd2IHN}drgHee;K=N&jeg2oX8z6K@<_X87Jo3cSda zbsBygq3kX=FwX)cH*$LoErQu5A>b!6g7Hd>01<+5Fb@(26974s5Df8ninqC+U$0Ua z?(2dSZ)XN#hPEpDi*QERUPmB$?idb}pfU|Ll=^6a0T6Hmfp;D_U$0V_?B|JZ$8z%_ zvXXFrlxiKh+k7tE&-ypjmd`=6@-|reSU*KDySFDkL0Ayy4l~nLfuv#OS$XVm5Lf8{ z{fD*tKOH3oRcC~gWa9mP09%)PE6|fzOeEj0;NRlQVQy3ac8d*=^!%y09Qwoa(+E#JV=z6*{2ZR8 z$6#(udILm>_aRwli8Pz1GLx57NN~VAGQdLjWU^0nz)+tpnNtNrI(lui{m&KQoWj0T 
zAUwf@-AZ&&78qFvu=8T`4JeL8M7)w#8WO!j2qTqV)kOru5DXsrU2tVaJ=>1^?#9Eh zY2U#UC4ZK>{GBUcK}xWNl#%?8v7z9W^Te~}G{LBYFg6v8#^_GxWXlj%*c=VN8w@32 zzI!)eNLk5Y)qCEIWt_t~A!pCFT7e-_*4Naxd?IKPwQ{G=dE}L7~ANfUv0%silTcQfg^AgBUgZjPl&f# zNfw^Wj~a$Z?sWNhpgJyiUDWIZQ%3P8(6X~!6VOU|Q3L*{_(ImyHRXH=ONbR=v@UJR zh%$Z?Ry5yhL~{x%6Ld0mKH~AYSP)DSM=~$)jlEc=%3!U-O%s>}+A&dNISb#`!kT}+euKR8#7{9Mst=qXi1SB)}M42vZ7MlpXwEysqTyp5*H3Mn$&l439~OD zn64}=svNtmo)3@E(}IBrE*XIA%nm1c$skvRsK5epi#UUI=B_=#fw_BK*39GI4Vlsx zGZH*K#c)Jpa>;yU?fC~HV&AT<{+#Oj8D+`DMNfC)-Xw);@{&)t#1k)C}X zE*Cb4vz9?!a7w1EzmbfHYpx21fDx^J@bAcYq_r~mcnB>iq}#aL{!zvTyfA0>JlbEr zE<#pJm11Vm{mO4d1YUWq=ZV}jDF*wlC~zI?B9A&>q_l@b6r0MnJ3XTi~Iig+y$7t8ebe1}RtgM_NhuI^jqsr1k(Q^|RSi!3aW(J>#A{4;I&a5)J7Yvng zj!ddE3?usiXP?O#q8>DaREn{M-~dXvRq%o1IJ$B&#wxc;Gvum-)f{)urb0=x;Y~n4 zmOaY0FfT1cSqMP28Wt4YtXDEAi)J5MhLR~WNenDI<+GDsfz!~60>ti z%t#(H#F`wKZQD~{!w@^ro-+vPP%dF07S?K8$L*YVHo4;{=kNye+`@;-0cVsRfR`RJ zkJIE5k+P9kWMfEM!Mc2f2F>cUB*#XYIAi4)+HNHIQCM~mtSlao_yk;3Fig+mTm8@k zyFR4Wmzl9wKC!q{rcjQnCh_futSdWTTw8t`Dnn}K*0RlKY0+ce9-6fxVc z`p4$5f*0195a;StIRpr?NuoDA7+7@5iaWHsI^wV}Gy76qK?%Vb&t@1RGU47^J0Qv) z@1*|y)y)|GO)&6wJIr;bN45Z{O&SX$*U+jn3|24%_c%m$cyjoxOqe&P!KDrC?0v5Wij?2MZhaN8G`p^0CHr5l<~;Io97dX?C95T>Gg90Dnnu}W3Z&9|0F=P zg*voUTN|9Tiqzt|B@$YIcA4`0bHDlZ#O3K+S+Fgdtd#k&CMgX)n`YS(ZiFQzeOVnA zhH~;jp<>!OrmDTsL&2*mkh5)2;k38dvXd&x4*Yj|;<0}))xGz$3S=g09Ockje1+#k zD!id&FNHoEK&+^#_Qxun>ip~Ein6C>nG@h5^}m46xymPkBJMq6h^i|hI{rgoN_(+R zP-?`K_KIPkwVi8k?2Gl{W~F8iQtky16E%pKb&h%v4_YyOg!(Bcde!H!=aM5JTKi0! 
z(w7V7h<#1OZWI%&53_DLvJ22^V%-5Yxlki?jm+pTCi>6(z2kt8mjKi z%wVqmgtnSwg5RPs6T+mmAg#YhQ%n_=uI$L?QyRgU%8ytvMQQsp&%BTeMgtpII!(Vu zFp;>i^+hWS@qLG)rll8pNrWJJ; zg6WMl`=Ps)h~4KqRe)n7#Sah9e7r?0@z~xuMp^+at;<;&d(c3X4d6n)3?lUPB`(0$ z_sesBoyOF>R*sriHAV}l)@C3+R(P#Aglnw$B~aP1>p2EoN_lMtM3J?sM!M{&9YmP9 z&@CGIO+Da~;c~xKW6GsKP37#0MQ@#@R03nH029V>w6eq;LLXV?RP%I?TWprGg;Ysa zsti{*VL0xG@T8{}s(vW$*1R*^a#4*zPkn@L*5yP7%wQNJPCIDi$~i1JGC(mu{ZowC z6*a~hf+D12(i+2xFszLgVi_c(AMg1CHpU0nI7_Z5n6yZu5{yhtv@F(7ZOXpS6(Ghlb7E9XjVW3S6;)@cmo`sKfcRae?Aw+Obs=X>s6LorkUa53E&hlEZDvUTN-PGs`&x zBR1opcD(1)$;`K4o?5Ht(%IWwjYa_d&|!46*vSlERbz2X!e&p2~=jV!%t} zU;Gfk6kwO5M~oyK?@dxdum%UPo6M2!R=GX5VDfn_sHzp!_|lZBH{4m9wa|&V(!Jc; z=9g14{DT)VITd3BjXNZq(Je1fQvxOw|AF-Mw%ih4i$(X*qkPdr^N%%#k(1EK5;Q)D zgCoiZ^8+Vl`Dlz~ctVkj9dtF<^Q@!a-JsbdBL22u?^gI)OK7Ha!VfE6|_L1cb^sp=|jcGF%&ksc{4EFUV zyjMt2S)^51RL+g&y~VfJ2KD8w5~i9tw+!u?E94zxGEEkF8t!bh>&^hp$2kQvRko&S zquJ-WaB0-x<=S5}n(|nLXl84(w7T#5$GU!ki79$7g@JRxU`xBEwotH4Q;7rtZ`> zX-NT_R`+RvOa{xQGE%2!mV-6W*2}`N@ZP|qT$a(4Yf*oiO_>z9dOuq(Y*af)owM|z z#Z*5VPu0|eq{+*g`g-@PmO^J~s8uLu3N@i2FoR_OyuA~b&+@%x@s*jklE&`I!GxNv zR3DLMrn53runsV@GlncqHdAPITVv@&A^kKsR*<2~Up`X2JJ~Ea&~6yiW4kbQ`r5f= zbwc2Z)e)4vrP9U0-LvzpD$U~vPq`5cH9dxZKWIALOojO_8iSFY*`k}-dDN1dg^`E4nb$B zr8+)JcUXbv1Q|USkY$41Sj{{@t`TIKkknBD>Jc!u7-NWe{A8OXOvIhkQ=9gkC|#k7 z>D-<}LY=4H%_?=OeTiVc3T}fFHdvI;=dDkyEUSY?1T#UlY*brIc5sRgV$89a=FXs> zM{Qt&3(ACMWS&h}Vp{i;xh%Q6JiU);TP%xCbe!g|uvU-K-PAjC^V0*g@d;Ox)Iduo zVfI`ybQ%YIon^U*%apHi`l>ovD)E{9w;oxH!ye7=V5=$79hE51&F!=#DISNOzXgzJ zZkLx&qA8|DjYAm0I0P>^U$nxT3)h-(-~3OqTIoy-+krD*eLN!{J5MY$n^~~jgT^k- zW17a5J(l5%z-LI+#NqT28wxVJTo#4m0hv|lsX=1Mv-R-3U}uZCD0e;Kp^qw%H%(tx zuwP9yi)B`GfI2g@DNPxpO-&S!9>*>*LqALd*AcU-xxiKjHch7Mi*Bg7V7( P00000NkvXXu0mjf;zvIF literal 0 HcmV?d00001 diff --git a/examples/planet.css b/examples/planet.css new file mode 100644 index 0000000..05653c0 --- /dev/null +++ b/examples/planet.css @@ -0,0 +1,150 @@ +body { + border-right: 1px solid black; + margin-right: 200px; + + padding-left: 20px; + padding-right: 
20px; +} + +h1 { + margin-top: 0px; + padding-top: 20px; + + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: normal; + letter-spacing: -2px; + text-transform: lowercase; + text-align: right; + + color: grey; +} + +.admin { + text-align: right; +} + +h2 { + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: normal; + color: #200080; + + margin-left: -20px; +} + +h3 { + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: normal; + + background-color: #a0c0ff; + border: 1px solid #5080b0; + + padding: 4px; +} + +h3 a { + text-decoration: none; + color: inherit; +} + +h4 { + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: bold; +} + +h4 a { + text-decoration: none; + color: inherit; +} + +img.face { + float: right; + margin-top: -3em; +} + +.entry { + margin-bottom: 2em; +} + +.entry .date { + font-family: "Bitstream Vera Sans", sans-serif; + color: grey; +} + +.entry .date a { + text-decoration: none; + color: inherit; +} + +.sidebar { + position: absolute; + top: 0px; + right: 0px; + width: 200px; + + margin-left: 0px; + margin-right: 0px; + padding-right: 0px; + + padding-top: 20px; + padding-left: 0px; + + font-family: "Bitstream Vera Sans", sans-serif; + font-size: 85%; +} + +.sidebar h2 { + font-size: 110%; + font-weight: bold; + color: black; + + padding-left: 5px; + margin-left: 0px; +} + +.sidebar ul { + padding-left: 1em; + margin-left: 0px; + + list-style-type: none; +} + +.sidebar ul li:hover { + color: grey; +} + +.sidebar ul li a { + text-decoration: none; +} + +.sidebar ul li a:hover { + text-decoration: underline; +} + +.sidebar ul li a img { + border: 0; +} + +.sidebar p { + border-top: 1px solid grey; + margin-top: 30px; + padding-top: 10px; + + padding-left: 5px; +} + +.sidebar .message { + cursor: help; + border-bottom: 1px dashed red; +} + +.sidebar a.message:hover { + cursor: help; + background-color: #ff0000; + color: #ffffff !important; + text-decoration: none !important; +} + +a:hover { + 
text-decoration: underline !important; + color: blue !important; +} diff --git a/examples/planet.xslt b/examples/planet.xslt new file mode 100644 index 0000000..f240f39 --- /dev/null +++ b/examples/planet.xslt @@ -0,0 +1,65 @@ + + + + + + + <xsl:value-of select="atom:title"/> + + +

+ + + +
+ + + + + + + +

+
+ +

+ + + + — + + + +

+ +
+ + +

+
+ +

+
+
+
+
+ diff --git a/planet/BeautifulSoup.py b/planet/BeautifulSoup.py new file mode 100644 index 0000000..1aec4cd --- /dev/null +++ b/planet/BeautifulSoup.py @@ -0,0 +1,1824 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses a (possibly invalid) XML or HTML document into a +tree representation. It provides methods and Pythonic idioms that make +it easy to navigate, search, and modify the tree. + +A well-structured XML/HTML document yields a well-behaved data +structure. An ill-structured XML/HTML document yields a +correspondingly ill-behaved data structure. If your document is only +locally well-structured, you can use this library to find and process +the well-structured part of it. + +Beautiful Soup works with Python 2.2 and up. It has no external +dependencies, but you'll have more success at converting data to UTF-8 +if you also install these three packages: + +* chardet, for auto-detecting character encodings + http://chardet.feedparser.org/ +* cjkcodecs and iconv_codec, which add more encodings to the ones supported + by stock Python. + http://cjkpython.i18n.org/ + +Beautiful Soup defines classes for two main parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. This class has web browser-like heuristics for + obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup also defines a class (UnicodeDammit) for autodetecting +the encoding of an HTML or XML document, and converting it to +Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed +Parser. 
+ +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/documentation.html +""" +from __future__ import generators + +__author__ = "Leonard Richardson (crummy.com)" +__contributors__ = ["Sam Ruby (intertwingly.net)", + "the unwitting Mark Pilgrim (diveintomark.org)", + "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"] +__version__ = "3.0.3" +__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson" +__license__ = "PSF" + +from sgmllib import SGMLParser, SGMLParseError +import codecs +import types +import re +import sgmllib + +try: + from htmlentitydefs import name2codepoint +except: + import htmlentitydefs + name2codepoint={} + for (name,codepoint) in htmlentitydefs.entitydefs.iteritems(): + if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1])) + name2codepoint[name]=ord(codepoint) + +# This RE makes Beautiful Soup able to parse XML with namespaces. +sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') + +# This RE makes Beautiful Soup capable of recognizing numeric character +# references that use hexadecimal. +sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);') + +DEFAULT_OUTPUT_ENCODING = "utf-8" + +# First, the classes that represent markup elements. 
+ +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = None + self.previousSibling = None + self.nextSibling = None + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def replaceWith(self, replaceWith): + oldParent = self.parent + myIndex = self.parent.contents.index(self) + if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: + # We're replacing this element with one of its siblings. + index = self.parent.contents.index(replaceWith) + if index and index < myIndex: + # Furthermore, it comes before this element. That + # means that when we extract it, the index of this + # element will change. + myIndex = myIndex - 1 + self.extract() + oldParent.insert(myIndex, replaceWith) + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent: + try: + self.parent.contents.remove(self) + except ValueError: + pass + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + lastChild = self._lastRecursiveChild() + nextElement = lastChild.next + + if self.previous: + self.previous.next = nextElement + if nextElement: + nextElement.previous = self.previous + self.previous = None + lastChild.next = None + + self.parent = None + if self.previousSibling: + self.previousSibling.nextSibling = self.nextSibling + if self.nextSibling: + self.nextSibling.previousSibling = self.previousSibling + self.previousSibling = self.nextSibling = None + + def _lastRecursiveChild(self): + "Finds the last element beneath this object to be parsed." 
+ lastChild = self + while hasattr(lastChild, 'contents') and lastChild.contents: + lastChild = lastChild.contents[-1] + return lastChild + + def insert(self, position, newChild): + if (isinstance(newChild, basestring) + or isinstance(newChild, unicode)) \ + and not isinstance(newChild, NavigableString): + newChild = NavigableString(newChild) + + position = min(position, len(self.contents)) + if hasattr(newChild, 'parent') and newChild.parent != None: + # We're 'inserting' an element that's already one + # of this object's children. + if newChild.parent == self: + index = self.find(newChild) + if index and index < position: + # Furthermore we're moving it further down the + # list of this object's children. That means that + # when we extract this element, our target index + # will jump down one. + position = position - 1 + newChild.extract() + + newChild.parent = self + previousChild = None + if position == 0: + newChild.previousSibling = None + newChild.previous = self + else: + previousChild = self.contents[position-1] + newChild.previousSibling = previousChild + newChild.previousSibling.nextSibling = newChild + newChild.previous = previousChild._lastRecursiveChild() + if newChild.previous: + newChild.previous.next = newChild + + newChildsLastElement = newChild._lastRecursiveChild() + + if position >= len(self.contents): + newChild.nextSibling = None + + parent = self + parentsNextSibling = None + while not parentsNextSibling: + parentsNextSibling = parent.nextSibling + parent = parent.parent + if not parent: # This is the last element in the document. 
+ break + if parentsNextSibling: + newChildsLastElement.next = parentsNextSibling + else: + newChildsLastElement.next = None + else: + nextChild = self.contents[position] + newChild.nextSibling = nextChild + if newChild.nextSibling: + newChild.nextSibling.previousSibling = newChild + newChildsLastElement.next = nextChild + + if newChildsLastElement.next: + newChildsLastElement.next.previous = newChildsLastElement + self.contents.insert(position, newChild) + + def findNext(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + + def findAllNext(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before after Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.nextGenerator) + + def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._findOne(self.findNextSiblings, name, attrs, text, + **kwargs) + + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.nextSiblingGenerator, **kwargs) + fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + + def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + + def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and 
appear + before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.previousGenerator, + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x + + def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._findOne(self.findPreviousSiblings, name, attrs, text, + **kwargs) + + def findPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.previousSiblingGenerator, **kwargs) + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + def findParent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _findOne because findParents takes a different + # set of arguments. + r = None + l = self.findParents(name, attrs, 1) + if l: + r = l[0] + return r + + def findParents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._findAll(name, attrs, None, limit, self.parentGenerator, + **kwargs) + fetchParents = findParents # Compatibility with pre-3.x + + #These methods do the real heavy lifting. + + def _findOne(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _findAll(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." 
+ + if isinstance(name, SoupStrainer): + strainer = name + else: + # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) + results = ResultSet(strainer) + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These Generators can be used to navigate starting from both + #NavigableStrings and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + # Utility methods + def substituteEncoding(self, str, encoding=None): + encoding = encoding or "utf-8" + return str.replace("%SOUP-ENCODING%", encoding) + + def toEncoding(self, s, encoding=None): + """Encodes an object to a string in some encoding, or to Unicode. + .""" + if isinstance(s, unicode): + if encoding: + s = s.encode(encoding) + elif isinstance(s, str): + if encoding: + s = s.encode(encoding) + else: + s = unicode(s) + else: + if encoding: + s = self.toEncoding(str(s), encoding) + else: + s = unicode(s) + return s + +class NavigableString(unicode, PageElement): + + def __getattr__(self, attr): + """text.string gives you text. 
This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + + def __unicode__(self): + return __str__(self, None) + + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + if encoding: + return self.encode(encoding) + else: + return self + +class CData(NavigableString): + + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + return "" % NavigableString.__str__(self, encoding) + +class ProcessingInstruction(NavigableString): + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + output = self + if "%SOUP-ENCODING%" in output: + output = self.substituteEncoding(output, encoding) + return "" % self.toEncoding(output, encoding) + +class Comment(NavigableString): + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + return "" % NavigableString.__str__(self, encoding) + +class Declaration(NavigableString): + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + return "" % NavigableString.__str__(self, encoding) + +class Tag(PageElement): + """Represents a found HTML tag with its attributes and contents.""" + + XML_ENTITIES_TO_CHARS = { 'apos' : "'", + "quot" : '"', + "amp" : "&", + "lt" : "<", + "gt" : ">" + } + # An RE for finding ampersands that aren't the start of of a + # numeric entity. + BARE_AMPERSAND = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") + + def __init__(self, parser, name, attrs=None, parent=None, + previous=None): + "Basic constructor." 
+ + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected + self.parserClass = parser.__class__ + self.isSelfClosing = parser.isSelfClosingTag(name) + self.convertHTMLEntities = parser.convertHTMLEntities + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + self.containsSubstitutions = False + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def has_key(self, key): + return self._getAttrMap().has_key(key) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. 
+ self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + findAll() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.findAll, args, kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.find(tag[:-3]) + elif tag.find('__') != 0: + return self.find(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.__str__(encoding) + + def __unicode__(self): + return self.__str__(None) + + def _convertEntities(self, match): + x = match.group(1) + if x in name2codepoint: + c = unichr(name2codepoint[x]) + if c in self.XML_ENTITIES_TO_CHARS.values(): + return '&%s;' % x + else: + return c + elif x in self.XML_ENTITIES_TO_CHARS: + return '&%s;' % x + else: + return '&%s;' % x + + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + """Returns a string or Unicode representation of this tag and + its contents. To get Unicode, pass None for encoding. 
+ + NOTE: since Python's HTML parser consumes whitespace, this + method is not certain to reproduce the whitespace present in + the original string.""" + + encodedName = self.toEncoding(self.name, encoding) + + attrs = [] + if self.attrs: + for key, val in self.attrs: + fmt = '%s="%s"' + if isString(val): + if self.containsSubstitutions and '%SOUP-ENCODING%' in val: + val = self.substituteEncoding(val, encoding) + + # The attribute value either: + # + # * Contains no embedded double quotes or single quotes. + # No problem: we enclose it in double quotes. + # * Contains embedded single quotes. No problem: + # double quotes work here too. + # * Contains embedded double quotes. No problem: + # we enclose it in single quotes. + # * Embeds both single _and_ double quotes. This + # can't happen naturally, but it can happen if + # you modify an attribute value after parsing + # the document. Now we have a bit of a + # problem. We solve it by enclosing the + # attribute in single quotes, and escaping any + # embedded single quotes to XML entities. + if '"' in val: + # This can't happen naturally, but it can happen + # if you modify an attribute value after parsing. + if "'" in val: + val = val.replace('"', """) + else: + fmt = "%s='%s'" + + # Optionally convert any HTML entities + if self.convertHTMLEntities: + val = re.sub("&(\w+);", self._convertEntities, val) + + # Now we're okay w/r/t quotes. But the attribute + # value might also contain angle brackets, or + # ampersands that aren't part of entities. We need + # to escape those to XML entities too. 
+ val = val.replace("<", "<").replace(">", ">") + val = self.BARE_AMPERSAND.sub("&", val) + + + attrs.append(fmt % (self.toEncoding(key, encoding), + self.toEncoding(val, encoding))) + close = '' + closeTag = '' + if self.isSelfClosing: + close = ' /' + else: + closeTag = '' % encodedName + + indentTag, indentContents = 0, 0 + if prettyPrint: + indentTag = indentLevel + space = (' ' * (indentTag-1)) + indentContents = indentTag + 1 + contents = self.renderContents(encoding, prettyPrint, indentContents) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if prettyPrint: + s.append(space) + s.append('<%s%s%s>' % (encodedName, attributeString, close)) + if prettyPrint: + s.append("\n") + s.append(contents) + if prettyPrint and contents and contents[-1] != "\n": + s.append("\n") + if prettyPrint and closeTag: + s.append(space) + s.append(closeTag) + if prettyPrint and closeTag and self.nextSibling: + s.append("\n") + s = ''.join(s) + return s + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.__str__(encoding, True) + + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + """Renders the contents of this tag as a string in the given + encoding. 
If encoding is None, returns a Unicode string..""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.__str__(encoding) + elif isinstance(c, Tag): + s.append(c.__str__(encoding, prettyPrint, indentLevel)) + if text and prettyPrint: + text = text.strip() + if text: + if prettyPrint: + s.append(" " * (indentLevel-1)) + s.append(text) + if prettyPrint: + s.append("\n") + return ''.join(s) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.findAll(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def findAll(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. 
The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._findAll(name, attrs, text, limit, generator, **kwargs) + findChildren = findAll + + # Pre-3.x compatibility methods + first = find + fetch = findAll + + def fetchText(self, text=None, recursive=True, limit=None): + return self.findAll(text=text, recursive=recursive, limit=limit) + + def firstText(self, text=None, recursive=True): + return self.find(text=text, recursive=recursive) + + #Utility methods + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.contents.append(tag) + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def childGenerator(self): + for i in range(0, len(self.contents)): + yield self.contents[i] + raise StopIteration + + def recursiveChildGenerator(self): + stack = [(self, 0)] + while stack: + tag, start = stack.pop() + if isinstance(tag, Tag): + for i in range(start, len(tag.contents)): + a = tag.contents[i] + yield a + if isinstance(a, Tag) and tag.contents: + if i < len(tag.contents) - 1: + stack.append((tag, i+1)) + stack.append((a, 0)) + break + raise StopIteration + +# Next, a couple classes to represent queries and their results. 
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds up to three criteria: a tag-name matcher ('name'),
    a map of attribute name -> matcher ('attrs'), and a text matcher
    ('text'). See _matches() for the kinds of matcher that are
    understood."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the match criteria.

        name   -- matcher for the tag name, or None to accept any name.
        attrs  -- map of attribute name to matcher. A bare string is
                  shorthand for matching the 'class' attribute.
        text   -- matcher for text nodes.
        kwargs -- extra attribute matchers, merged into 'attrs'.

        The shared default for 'attrs' is never mutated: the map is
        copied before keyword filters are merged into it."""
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy so that neither the caller's map nor the default
                # argument is modified by the update.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        """Returns the text criterion if one is set, otherwise
        'name|attrs'."""
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag against the name and attribute criteria.

        markupName  -- either a Tag object or a tag name string.
        markupAttrs -- the tag's attributes, as a map or as a list of
                       (key, value) pairs. Ignored when 'markupName' is
                       a Tag, in which case the Tag itself is queried.

        Returns the Tag (or the tag name, if no Tag object was given)
        when everything matches, and None otherwise."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable 'name' criterion is handed the raw name and
        # attributes only when there is no Tag object to give it.
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
            markupAttrMap = None
            # self.attrs is None when attrs=None was passed without any
            # keyword filters; treat that as "no attribute criteria"
            # instead of failing on None.items().
            for attr, matchAgainst in (self.attrs or {}).items():
                if markupAttrMap is None:
                    # Build the lookup map lazily, and only once.
                    if hasattr(markupAttrs, 'get'):
                        markupAttrMap = markupAttrs
                    else:
                        markupAttrMap = {}
                        for k, v in markupAttrs:
                            markupAttrMap[k] = v
                attrValue = markupAttrMap.get(attr)
                if not self._matches(attrValue, matchAgainst):
                    match = False
                    break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Returns 'markup' (or, for a list, its first matching text
        element) if it satisfies this strainer, and None otherwise.

        Raises Exception if 'markup' is not a list, a Tag, a
        NavigableString or a string."""
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Returns a true value iff 'markup' satisfies 'matchAgainst'.

        'matchAgainst' may be:
         * the boolean True, which matches anything that is not None;
         * a callable, whose return value for 'markup' is the result;
         * a regular expression object (anything with a 'match'
           attribute), which is applied with search();
         * a list-like object, which matches if it contains 'markup';
         * a map, which matches if 'markup' is one of its keys;
         * anything else, which is compared to 'markup' for equality.

        Except for the first two cases, a Tag is first reduced to its
        name and any other non-string value is converted with
        unicode()."""
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # A map-like object that is not iterable: match on its
                # keys, as the list-like branch does for a real dict.
                # (Calling has_key on 'markup' itself can only fail,
                # since markup is a string or None here.)
                result = matchAgainst.has_key(markup)
            elif matchAgainst and isString(markup):
                # Coerce the pattern to the same string type as the
                # markup so the equality test below is like-for-like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Initialize this list itself, not a throwaway empty list.
        list.__init__(self)
        self.source = source

# Now, some helper functions.
+ +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return hasattr(l, '__iter__') \ + or (type(l) in (types.ListType, types.TupleType)) + +def isString(s): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is stringlike.""" + try: + return isinstance(s, unicode) or isintance(s, basestring) + except NameError: + return isinstance(s, str) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and + NESTING_RESET_TAGS maps out of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +# Now, the parser classes. + +class BeautifulStoneSoup(Tag, SGMLParser): + + """This class contains the basic parser and search code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "" actually means + "". + + [Another possible explanation is "", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] 
+ + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + + MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda x: x.group(1) + ' />'), + (re.compile(']*)>'), + lambda x: '') + ] + + ROOT_TAG_NAME = u'[document]' + + HTML_ENTITIES = "html" + XML_ENTITIES = "xml" + ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES] + + def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, + markupMassage=True, smartQuotesTo=XML_ENTITIES, + convertEntities=None, selfClosingTags=None): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser. + + sgmllib will process most bad HTML, and the BeautifulSoup + class has some tricks for dealing with some HTML that kills + sgmllib, but Beautiful Soup can nonetheless choke or lose data + if your data uses self-closing tags or declarations + incorrectly. + + By default, Beautiful Soup uses regexes to sanitize input, + avoiding the vast majority of these problems. If the problems + don't apply to you, pass in False for markupMassage, and + you'll get better performance. + + The default parser massage techniques fix the two most common + instances of invalid HTML that choke sgmllib: + +
(No space between name of closing tag and tag close) + (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + + self.parseOnlyThese = parseOnlyThese + self.fromEncoding = fromEncoding + self.smartQuotesTo = smartQuotesTo + + if convertEntities: + # It doesn't make sense to convert encoded characters to + # entities even while you're converting entities to Unicode. + # Just convert it all to Unicode. + self.smartQuotesTo = None + + if isList(convertEntities): + self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities + self.convertXMLEntities = self.XML_ENTITIES in convertEntities + else: + self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities + self.convertXMLEntities = self.XML_ENTITIES == convertEntities + + self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) + SGMLParser.__init__(self) + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + self.markup = markup + self.markupMassage = markupMassage + try: + self._feed() + except StopParsing: + pass + self.markup = None # The markup can now be GCed + + def _feed(self, inDocumentEncoding=None): + # Convert the document to Unicode. + markup = self.markup + if isinstance(markup, unicode): + if not hasattr(self, 'originalEncoding'): + self.originalEncoding = None + else: + dammit = UnicodeDammit\ + (markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.smartQuotesTo) + markup = dammit.unicode + self.originalEncoding = dammit.originalEncoding + if markup: + if self.markupMassage: + if not isList(self.markupMassage): + self.markupMassage = self.MARKUP_MASSAGE + for fix, m in self.markupMassage: + markup = fix.sub(m, markup) + self.reset() + + SGMLParser.feed(self, markup or "") + SGMLParser.close(self) + # Close out any unfinished strings and close all the open tags. 
+ self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def __getattr__(self, methodName): + """This method routes method call requests to either the SGMLParser + superclass or the Tag superclass, depending on the method name.""" + #print "__getattr__ called on %s.%s" % (self.__class__, methodName) + + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ + or methodName.find('do_') == 0: + return SGMLParser.__getattr__(self, methodName) + elif methodName.find('__') != 0: + return Tag.__getattr__(self, methodName) + else: + raise AttributeError + + def isSelfClosingTag(self, name): + """Returns true iff the given string is the name of a + self-closing tag according to this parser.""" + return self.SELF_CLOSING_TAGS.has_key(name) \ + or self.instanceSelfClosingTags.has_key(name) + + def reset(self): + Tag.__init__(self, self, self.ROOT_TAG_NAME) + self.hidden = 1 + SGMLParser.reset(self) + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.quoteStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableString): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = ''.join(self.currentData) + if currentData.endswith('<') and self.convertHTMLEntities: + currentData = currentData[:-1] + '<' + if not currentData.strip(): + if '\n' in currentData: + currentData = 
'\n' + else: + currentData = ' ' + self.currentData = [] + if self.parseOnlyThese and len(self.tagStack) <= 1 and \ + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): + return + o = containerClass(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: +

FooBar

should pop to 'p', not 'b'. +

FooBar

should pop to 'table', not 'p'. +

Foo

Bar

should pop to 'tr', not 'p'. +

FooBar

should pop to 'p', not 'b'. + +

    • *
    • * should pop to 'ul', not the first 'li'. +
  • ** should pop to 'table', not the first 'tr' + tag should + implicitly close the previous tag within the same
    ** should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s: %s" % (name, attrs) + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.currentData.append('<%s%s>' % (name, attrs)) + return + self.endData() + + if not self.isSelfClosingTag(name) and not selfClosing: + self._smartPop(name) + + if self.parseOnlyThese and len(self.tagStack) <= 1 \ + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + return + + tag = Tag(self, name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or self.isSelfClosingTag(name): + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + return tag + + def unknown_endtag(self, name): + #print "End tag %s" % name + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. 
+ #print " is not real!" % name + self.currentData.append('' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + if self.convertHTMLEntities: + if data[0] == '&': + data = self.BARE_AMPERSAND.sub("&",data) + else: + data = data.replace('&','&') \ + .replace('<','<') \ + .replace('>','>') + self.currentData.append(data) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.endData() + self.handle_data(text) + self.endData(subclass) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = "xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." 
+ if ref[0] == 'x': + data = unichr(int(ref[1:],16)) + else: + data = unichr(int(ref)) + + if u'\x80' <= data <= u'\x9F': + data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo) + elif not self.convertHTMLEntities and not self.convertXMLEntities: + data = '&#%s;' % ref + + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML entity references to the corresponding Unicode + characters.""" + replaceWithXMLEntity = self.convertXMLEntities and \ + self.XML_ENTITIES_TO_CHARS.has_key(ref) + if self.convertHTMLEntities or replaceWithXMLEntity: + try: + data = unichr(name2codepoint[ref]) + except KeyError: + if replaceWithXMLEntity: + data = self.XML_ENTITIES_TO_CHARS.get(ref) + else: + data="&%s" % ref + else: + data = '&%s;' % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = SGMLParser.parse_declaration(self, i) + except SGMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. 
For instance, the occurance of + a

    tag should implicitly close the previous

    tag. + +

    Para1

    Para2 + should be transformed into: +

    Para1

    Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a

    tag should _not_ implicitly close the previous +
    tag. + + Alice said:
    Bob said:
    Blah + should NOT be transformed into: + Alice said:
    Bob said:
    Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a
    , + but not close a tag in another table. + +
    BlahBlah + should be transformed into: +
    BlahBlah + but, + Blah
    Blah + should NOT be transformed into + Blah
    Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup, MinimalSoup, or + BeautifulStoneSoup before writing your own subclass.""" + + def __init__(self, *args, **kwargs): + if not kwargs.has_key('smartQuotesTo'): + kwargs['smartQuotesTo'] = self.HTML_ENTITIES + BeautifulStoneSoup.__init__(self, *args, **kwargs) + + SELF_CLOSING_TAGS = buildTagMap(None, + ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + QUOTE_TAGS = {'script': None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + 'thead' : ['table'], + 'tbody' : ['table'], + 'tfoot' : ['table'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. 
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + + # Used to detect the charset in a META tag; see start_meta + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)") + + def start_meta(self, attrs): + """Beautiful Soup can detect a charset included in a META tag, + try to convert the document to that charset, and re-parse the + document from the beginning.""" + httpEquiv = None + contentType = None + contentTypeIndex = None + tagNeedsEncodingSubstitution = False + + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i + + if httpEquiv and contentType: # It's an interesting meta tag. + match = self.CHARSET_RE.search(contentType) + if match: + if getattr(self, 'declaredHTMLEncoding') or \ + (self.originalEncoding == self.fromEncoding): + # This is our second pass through the document, or + # else an encoding was specified explicitly and it + # worked. Rewrite the meta tag. + newAttr = self.CHARSET_RE.sub\ + (lambda(match):match.group(1) + + "%SOUP-ENCODING%", value) + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) + tagNeedsEncodingSubstitution = True + else: + # This is our first pass through the document. + # Go through it again with the new information. 
+ newCharset = match.group(3) + if newCharset and newCharset != self.originalEncoding: + self.declaredHTMLEncoding = newCharset + self._feed(self.declaredHTMLEncoding) + raise StopParsing + tag = self.unknown_starttag("meta", attrs) + if tag and tagNeedsEncodingSubstitution: + tag.containsSubstitutions = True + +class StopParsing(Exception): + pass + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + FooBar + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "FooBar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. 
This class handles the not-co-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class MinimalSoup(BeautifulSoup): + """The MinimalSoup class is for parsing HTML that contains + pathologically bad markup. It makes no assumptions about tag + nesting, but it does know which tags are self-closing, that +