Python 抓取电视猫官网数据制作 EPG (二)

1. 把获取单个频道 EPG 的逻辑封装成函数,保存为文件 tvmao.py

#!/usr/bin/python
# coding: utf-8
# by 黑鸟博客
 
import urllib3
import requests
import datetime
import time
import base64
import ssl
import json
#import http.cookiejar
from bs4 import BeautifulSoup  
 
 
def is_valid_date(strdate):
    """Return True if *strdate* is a valid "HH:MM" time string.

    Used to tell time-slot cells (e.g. "19:30") apart from
    programme-title cells when walking the scraped schedule spans.
    """
    # Quick reject: a time cell always contains a colon.
    if ":" not in strdate:
        return False
    try:
        time.strptime(strdate, "%H:%M")
    except ValueError:
        # Narrowed from a bare `except:` — strptime signals a bad
        # time string with ValueError; anything else should surface.
        return False
    return True

 
def sub_req(a, q, id):
    """Build the obfuscated query parameter for tvmao's /api/pg endpoint.

    Mirrors the site's JavaScript: base64-encode two pipe-joined pieces
    and prefix a single character picked out of a base64 alphabet by
    the squared day-of-week (Monday=1 .. Sunday=7).
    """
    _keyStr = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="

    encoded_q = base64.b64encode(("|" + q).encode('utf-8')).decode('utf-8')
    encoded_id_a = base64.b64encode((id + "|" + a).encode('utf-8')).decode('utf-8')

    # %w yields 0 for Sunday; the site numbers Sunday as day 7.
    weekday = int(time.strftime("%w")) or 7
    prefix = _keyStr[weekday * weekday]

    return prefix + encoded_id_a + encoded_q

def _write_program_spans(f, spans):
    """Append schedule <span> cells to the open file *f*.

    A time cell ("HH:MM") is written without a newline so the title
    that follows lands on the same line; a title cell ends the line.
    The live-marker cell ('正在播出') is skipped entirely.
    """
    for tagprogram in spans:
        try:
            if is_valid_date(tagprogram.text):
                f.write(tagprogram.text)
            elif tagprogram.text != '正在播出':
                f.write(tagprogram.text)
                f.write("\n")
        except Exception:
            # Best-effort scraping: a malformed node must not abort
            # the whole schedule, so skip it and keep going.
            continue


def get_program_info(link, sublink, week_day, epg_file_name):
    """Fetch one channel's schedule for one day and append it to a file.

    link          -- site root, e.g. "https://www.tvmao.com"
    sublink       -- channel/day page path, e.g. "/program/CCTV-CCTV1-w1.html"
    week_day      -- weekday being fetched (1=Monday .. 7=Sunday); used
                     to compute the date header written before the data
    epg_file_name -- text file the schedule is appended to

    The page embeds only the first part of the schedule; the rest is
    loaded client-side from /api/pg, so a second request reproduces
    that XHR (see sub_req).
    """
    # Date header: shift today's date to the requested weekday.
    header = time.strftime(
        "%Y/%m/%d %A",
        time.localtime(time.time() + (week_day - int(time.strftime("%w"))) * 24 * 3600))
    with open(epg_file_name, "a+") as f:
        # `with` closes the file; the explicit close() calls were redundant.
        f.write(header)
        f.write("\n\n")

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
               'Connection': 'keep-alive', 'Cache-Control': 'no-cache'}
    website = '%s%s' % (link, sublink)
    # timeout added so a stalled connection cannot hang the whole crawl.
    r = requests.get(website, headers=headers, timeout=30)

    soup = BeautifulSoup(r.text, 'lxml')
    # First half of the schedule is embedded in the page itself.
    list_program_div = soup.find(name='div', attrs={"class": "epg"}).find_all(name='span')
    with open(epg_file_name, "a+") as f:
        _write_program_spans(f, list_program_div)

    # Second half comes from the /api/pg XHR; rebuild its query string
    # from the hidden <form> attributes the page's JavaScript would use.
    list_first_form = soup.find(name='form')
    sublink = "/api/pg?p=" + sub_req(list_first_form["a"], list_first_form["q"],
                                     list_first_form.button["id"])

    website = '%s%s' % (link, sublink)
    sub_r = requests.get(website, timeout=30)

    # The JSON response carries an HTML fragment at index 1.
    soup = BeautifulSoup(sub_r.json()[1], 'lxml')
    with open(epg_file_name, "a+") as f:
        _write_program_spans(f, soup.find_all(name='span'))
        f.write("\n\n")
 
 
def get_program(link, sublink, week_day, epg_file_name):
    """Public entry point used by the driver script.

    Thin pass-through to get_program_info, kept so external callers
    depend on a stable name while the scraping internals may change.
    """
    return get_program_info(link, sublink, week_day, epg_file_name)


2. 调用 tvmao 中封装的函数,实现多个频道 EPG 的获取。

#!/usr/bin/python
# coding: utf-8
# by 黑鸟博客
 
import os
import tvmao

link = "https://www.tvmao.com"


def fetch_epg(prog_list, epg_path, sublink_fmt):
    """Download a full week of EPG data for every channel in *prog_list*.

    prog_list   -- channel codes exactly as they appear in tvmao URLs
    epg_path    -- output directory (created if missing); one .txt per channel
    sublink_fmt -- page-path template with two %s slots: channel code, weekday

    The CCTV and provincial loops were copy-paste duplicates; this
    helper removes the duplication without changing what gets fetched.
    """
    if not os.path.exists(epg_path):
        os.makedirs(epg_path)
    for prog in prog_list:
        epg_name = epg_path + prog + '.txt'
        # Truncate any previous run's output before appending.
        with open(epg_name, "w+") as f:
            f.write("")
        print(prog)
        for num in range(1, 8):  # w1..w7 = Monday..Sunday pages
            sublink = sublink_fmt % (prog, num)
            tvmao.get_program(link, sublink, num, epg_name)


# CCTV channels; codes are copied manually from the tvmao page URLs.
CCTV_prog = ['CCTV1', 'CCTV2', 'CCTV3', 'CCTV4', 'CCTV5', 'CCTV6']
fetch_epg(CCTV_prog, 'epg/cctv/', "/program/CCTV-%s-w%s.html")

# Provincial satellite channels.
province_prog = ['AHTV1', 'BTV1', 'CCQTV1', 'FJTV2', 'XMTV5', 'HUNANTV1']
fetch_epg(province_prog, 'epg/province/', "/program_satellite/%s-w%s.html")
 

剩下的就是根据需求对 EPG 文件的处理了。

说明: 当期的节目路径是手动从网页上复制的, 需要哪个节目,添加到列表中。需要人工确认路径正确性。

下一步要完成的是实现从网页获取节目的路径, 实现全自动化。

点赞
  1. 子午书屋说道:

    老实说,电视猫很不错

发表评论

电子邮件地址不会被公开。 必填项已用*标注