Commit 9e2b5acc authored by whzecomjm's avatar whzecomjm
Browse files

init python instances

parents
Loading
Loading
Loading
Loading

Douban/douban.ipynb

0 → 100644
+169 −0
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# 提取豆瓣电影Top250

%% Cell type:code id: tags:

``` python
#coding:utf-8
# 文件导出版本
'''''
@author: jsjxy
@origin: https://www.cnblogs.com/xisheng/p/9130156.html
'''
from urllib import request
from bs4 import BeautifulSoup
import re
#from distutils.filelist import findall

def doubantop(N):
    """Export the first N entries of the Douban movie ranking to a text file.

    Fetches as many result pages of https://movie.douban.com/top250 as are
    needed to cover N entries (the ``start`` query parameter advances in
    steps of 25) and writes one tab-separated line per movie -- title,
    rating, number of ratings, link -- to ``./douban-top-<N>.txt``.

    Args:
        N: Number of top-ranked movies to export. Values that are not a
            multiple of 25 are honoured exactly: the last page is fetched
            and writing stops after N entries.

    Network and parsing errors are not caught; they propagate to the caller.
    """
    page_size = 25
    # Ceiling division, so a partial last page is still fetched.
    page_count = -(-N // page_size)
    written = 0
    # Explicit UTF-8 keeps the Chinese text independent of the platform locale.
    with open('./douban-top-%s.txt'%str(N),'w',encoding='utf-8') as f:
        f.write("豆瓣电影TOP"+str(N)+'\n')
        f.write('%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接'))
        for n in range(page_count):
            url = 'https://movie.douban.com/top250?start='+str(page_size*n)+'&filter='
            # Close the HTTP response as soon as the body has been read.
            with request.urlopen(url) as page:
                contents = page.read()
            soup = BeautifulSoup(contents,"html.parser")

            for tag in soup.find_all('div', class_='info'):
                if written >= N:
                    # The requested number of entries has been written.
                    break
                m_name = tag.find('span', class_='title').get_text()
                m_rating_score = float(tag.find('span',class_='rating_num').get_text())
                m_people = tag.find('div',class_="star")
                m_span = m_people.find_all('span')
                # The fourth <span> of the star block carries the vote count
                # text; keep only its digits.
                m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
                m_url=tag.find('a').get('href')
                f.write('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url) )
                written += 1
    print('请查看文件douban-top-%s!' %str(N))

# Ask how many entries to export, convert the answer to an int, and run the export.
num = int(input("你想要看豆瓣电影TOP榜单的前几名(小于等于250且最好是25的倍数):"))
doubantop(num)
```

%% Output

    你想要看豆瓣电影TOP榜单的前几名(小于等于250且最好是25的倍数):100
    请查看文件douban-top-100!

%% Cell type:code id: tags:

``` python
# print 查看版本
#coding:utf-8
'''''
@author: jsjxy
@origin: https://www.cnblogs.com/xisheng/p/9130156.html
'''
from urllib import request
from bs4 import BeautifulSoup
import re
#from distutils.filelist import findall

# Scrape ten result pages once and keep one formatted, newline-terminated
# line per movie in all_list, in ranking order.
all_list = []
for n in range(10):
    url = 'https://movie.douban.com/top250?start='+str(25*n)+'&filter='
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(url) as page:
        contents = page.read()
    soup = BeautifulSoup(contents,"html.parser")
    for tag in soup.find_all('div', class_='info'):
        m_name = tag.find('span', class_='title').get_text()
        m_rating_score = float(tag.find('span',class_='rating_num').get_text())
        m_people = tag.find('div',class_="star")
        m_span = m_people.find_all('span')
        # The fourth <span> of the star block carries the vote count text;
        # keep only its digits.
        m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
        m_url=tag.find('a').get('href')
        all_list.append('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url))



def doubantop(N,n=1):
    """Print entries n through N (1-based, inclusive) of the scraped ranking.

    Reads the module-level ``all_list``, whose elements are pre-formatted,
    tab-separated lines, and prints a two-line header followed by the
    selected entries.

    Args:
        N: Rank of the last entry to print.
        n: Rank of the first entry to print; defaults to 1.
    """
    print("豆瓣电影TOP "+str(n)+' - '+str(N)+'\n'+'%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接'))
    for movie in all_list[n-1:N]:
        # Entries are stored with a trailing newline; strip it so print()
        # does not leave an empty line after every movie.
        print(movie.rstrip('\n'))



#num = input("你想要看豆瓣电影TOP250榜单的范围")
#num = int(num)
# Show the first 30 entries of the scraped ranking.
doubantop(N=30)

```

%% Output

    豆瓣电影TOP 1 - 30
    影片名          评分   评价人数           链接
    
    肖申克的救赎	9.6	1288574	https://movie.douban.com/subject/1292052/
    
    霸王别姬	9.6	951218	https://movie.douban.com/subject/1291546/
    
    这个杀手不太冷	9.4	1182952	https://movie.douban.com/subject/1295644/
    
    阿甘正传	9.4	1015734	https://movie.douban.com/subject/1292720/
    
    美丽人生	9.5	594513	https://movie.douban.com/subject/1292063/
    
    泰坦尼克号	9.3	951937	https://movie.douban.com/subject/1292722/
    
    千与千寻	9.3	944451	https://movie.douban.com/subject/1291561/
    
    辛德勒的名单	9.5	532059	https://movie.douban.com/subject/1295124/
    
    盗梦空间	9.3	1026845	https://movie.douban.com/subject/3541415/
    
    机器人总动员	9.3	683063	https://movie.douban.com/subject/2131459/
    
    忠犬八公的故事	9.3	671461	https://movie.douban.com/subject/3011091/
    
    三傻大闹宝莱坞	9.2	921705	https://movie.douban.com/subject/3793023/
    
    海上钢琴师	9.2	765462	https://movie.douban.com/subject/1292001/
    
    放牛班的春天	9.3	635757	https://movie.douban.com/subject/1291549/
    
    大话西游之大圣娶亲	9.2	707374	https://movie.douban.com/subject/1292213/
    
    楚门的世界	9.2	691053	https://movie.douban.com/subject/1292064/
    
    龙猫	9.2	623627	https://movie.douban.com/subject/1291560/
    
    星际穿越	9.2	701880	https://movie.douban.com/subject/1889243/
    
    教父	9.2	465133	https://movie.douban.com/subject/1291841/
    
    熔炉	9.3	403040	https://movie.douban.com/subject/5912992/
    
    无间道	9.1	581480	https://movie.douban.com/subject/1307914/
    
    当幸福来敲门	9.0	745379	https://movie.douban.com/subject/1849031/
    
    疯狂动物城	9.2	781539	https://movie.douban.com/subject/25662329/
    
    触不可及	9.2	491584	https://movie.douban.com/subject/6786002/
    
    怦然心动	9.0	810776	https://movie.douban.com/subject/3319755/
    
    乱世佳人	9.2	349486	https://movie.douban.com/subject/1300267/
    
    蝙蝠侠:黑暗骑士	9.1	469114	https://movie.douban.com/subject/1851857/
    
    活着	9.2	382700	https://movie.douban.com/subject/1292365/
    
    天堂电影院	9.1	380814	https://movie.douban.com/subject/1291828/
    
    少年派的奇幻漂流	9.0	755425	https://movie.douban.com/subject/1929463/
    

%% Cell type:code id: tags:

``` python
```

Douban/douban.py

0 → 100644
+92 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
"""douban.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1qg0hwEsZ4IjUxPMAlKT0DwKyUgnjc6Zx

# 提取豆瓣电影Top250
"""

#coding:utf-8  
# 文件导出版本
''''' 
@author: jsjxy 
@origin: https://www.cnblogs.com/xisheng/p/9130156.html
'''  
from urllib import request 
from bs4 import BeautifulSoup  
import re
#from distutils.filelist import findall   

def doubantop(N):
    """Export the first N entries of the Douban movie ranking to a text file.

    Fetches as many result pages of https://movie.douban.com/top250 as are
    needed to cover N entries (the ``start`` query parameter advances in
    steps of 25) and writes one tab-separated line per movie -- title,
    rating, number of ratings, link -- to ``./douban-top-<N>.txt``.

    Args:
        N: Number of top-ranked movies to export. Values that are not a
            multiple of 25 are honoured exactly: the last page is fetched
            and writing stops after N entries.

    Network and parsing errors are not caught; they propagate to the caller.
    """
    page_size = 25
    # Ceiling division, so a partial last page is still fetched.
    page_count = -(-N // page_size)
    written = 0
    # Explicit UTF-8 keeps the Chinese text independent of the platform locale.
    with open('./douban-top-%s.txt'%str(N),'w',encoding='utf-8') as f:
        f.write("豆瓣电影TOP"+str(N)+'\n')
        f.write('%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接'))
        for n in range(page_count):
            url = 'https://movie.douban.com/top250?start='+str(page_size*n)+'&filter='
            # Close the HTTP response as soon as the body has been read.
            with request.urlopen(url) as page:
                contents = page.read()
            soup = BeautifulSoup(contents,"html.parser")

            for tag in soup.find_all('div', class_='info'):
                if written >= N:
                    # The requested number of entries has been written.
                    break
                m_name = tag.find('span', class_='title').get_text()
                m_rating_score = float(tag.find('span',class_='rating_num').get_text())
                m_people = tag.find('div',class_="star")
                m_span = m_people.find_all('span')
                # The fourth <span> of the star block carries the vote count
                # text; keep only its digits.
                m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
                m_url=tag.find('a').get('href')
                f.write('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url) )
                written += 1
    print('请查看文件douban-top-%s!' %str(N))

# Ask how many entries to export, convert the answer to an int, and run the export.
num = int(input("你想要看豆瓣电影TOP榜单的前几名(小于等于250且最好是25的倍数):"))
doubantop(num)

# print 查看版本
#coding:utf-8  
''''' 
@author: jsjxy 
@origin: https://www.cnblogs.com/xisheng/p/9130156.html
'''  
from urllib import request 
from bs4 import BeautifulSoup  
import re
#from distutils.filelist import findall   

# Scrape ten result pages once and keep one formatted, newline-terminated
# line per movie in all_list, in ranking order.
all_list = []
for n in range(10):
    url = 'https://movie.douban.com/top250?start='+str(25*n)+'&filter='
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(url) as page:
        contents = page.read()
    soup = BeautifulSoup(contents,"html.parser")
    for tag in soup.find_all('div', class_='info'):
        m_name = tag.find('span', class_='title').get_text()
        m_rating_score = float(tag.find('span',class_='rating_num').get_text())
        m_people = tag.find('div',class_="star")
        m_span = m_people.find_all('span')
        # The fourth <span> of the star block carries the vote count text;
        # keep only its digits.
        m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
        m_url=tag.find('a').get('href')
        all_list.append('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url))



def doubantop(N,n=1):
    """Print entries n through N (1-based, inclusive) of the scraped ranking.

    Reads the module-level ``all_list``, whose elements are pre-formatted,
    tab-separated lines, and prints a two-line header followed by the
    selected entries.

    Args:
        N: Rank of the last entry to print.
        n: Rank of the first entry to print; defaults to 1.
    """
    print("豆瓣电影TOP "+str(n)+' - '+str(N)+'\n'+'%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接'))
    for movie in all_list[n-1:N]:
        # Entries are stored with a trailing newline; strip it so print()
        # does not leave an empty line after every movie.
        print(movie.rstrip('\n'))
        


#num = input("你想要看豆瓣电影TOP250榜单的范围")
#num = int(num)
# Show the first 30 entries of the scraped ranking.
doubantop(N=30)
+76 −0
Original line number Diff line number Diff line
%% Cell type:code id: tags:

``` python
from urllib import request
from bs4 import BeautifulSoup
import re

# Index page listing the course files, and the base URL that the page's
# relative links are appended to.
url=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/index.html'
link=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/'

# proxy={'http':'http://localhost:80'}
# Send a browser-like User-Agent with every request.
headers = ("User-Agent"," Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
opener = request.build_opener()
opener.addheaders = [headers]
# Installing the opener globally makes urlretrieve send the header as well.
request.install_opener(opener)

# Close the HTTP response as soon as the index page has been read.
with request.urlopen(url) as response:
    contents = response.read().decode()
soup = BeautifulSoup(contents,"html.parser")
for tag in soup.find_all('a'):
    pdf = tag.get('href')
    if not pdf:
        # Anchors without an href have nothing to download; concatenating
        # None to the base URL would raise TypeError.
        continue
    pdfurl = link+pdf
    print(pdfurl+"\n")
    pdfdir = 'C:/Users/whzec/Desktop/'+pdf
    # urlretrieve saves the resource to a local file; it lives in
    # urllib.request on Python 3 (plain urllib on Python 2).
    request.urlretrieve(pdfurl,pdfdir)
```

%% Output

    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/syll.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw1.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw2.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw2a.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw3.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw3a.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw4.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec1.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec2.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec3.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec4.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec5.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec6.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec7.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec8.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec9.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec10.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec11.pdf
    
    http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Bernotes.pdf
    

%% Cell type:code id: tags:

``` python
```
+36 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
"""LectureNoteDownload.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aE4KuvnzlfpXyd1fefTsj_jqoHnNkqxy
"""

from urllib import request
from bs4 import BeautifulSoup
import re

# Index page listing the course files, and the base URL that the page's
# relative links are appended to.
url=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/index.html'
link=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/'

# proxy={'http':'http://localhost:80'}
# Send a browser-like User-Agent with every request.
headers = ("User-Agent"," Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
opener = request.build_opener()
opener.addheaders = [headers]
# Installing the opener globally makes urlretrieve send the header as well.
request.install_opener(opener)

# Close the HTTP response as soon as the index page has been read.
with request.urlopen(url) as response:
    contents = response.read().decode()
soup = BeautifulSoup(contents,"html.parser")
for tag in soup.find_all('a'):
    pdf = tag.get('href')
    if not pdf:
        # Anchors without an href have nothing to download; concatenating
        # None to the base URL would raise TypeError.
        continue
    pdfurl = link+pdf
    print(pdfurl+"\n")
    pdfdir = 'C:/Users/whzec/Desktop/'+pdf
    # urlretrieve saves the resource to a local file; it lives in
    # urllib.request on Python 3 (plain urllib on Python 2).
    request.urlretrieve(pdfurl,pdfdir)
+25 −0
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# 文件的读取与写入

首先我们给出一个非常有用的例子, 即文件的替换.

%% Cell type:code id: tags:

``` python
# replace some words by python
import re
# Rewrite the file in place with every occurrence of 'List' replaced by
# 'cctvcctv'. The r prefix keeps the backslashes of the Windows path literal;
# without it, use forward slashes or doubled backslashes instead.
path = r'C:\Users\whzec\Desktop\1.md'
# Read the whole file first: opening it for writing below truncates it.
with open(path,'r') as f:
    alllines=f.readlines()
# The with-blocks close the file even if reading or writing fails.
with open(path,'w') as f:
    for eachline in alllines:
        f.write(re.sub('List','cctvcctv',eachline))
```

%% Cell type:code id: tags:

``` python
```