一个从Linux镜像站点递归下载文件的脚本

前面在研究libvirt-test-API这个开源框架时，发现里面的很多测试用例都是从头开始安装guest系统的，而且默认是从远程的一个URL来安装（URL配置在kickstart配置文件中）。找了国内的镜像（如：http://mirrors.163.com/centos/6.7/os/x86_64），觉得速度还是不如在本机自己提供镜像文件来得快，所以想把那里的一些文件下载下来，于是写了这个脚本。

脚本看起来并不太复杂，主要思路是先找到所有文件的URL（包括所有子目录中的文件），然后下载。我采用了urls_dict这个字典，以path为key，value中有这个path下所有的文件名列表(files)、父目录(parent)、子目录列表(sub_dirs)；先用递归算法生成这个urls_dict，然后逐个目录下载其中的文件（下载时只用到了urls_dict中的key和value中的files）。用到了requests库来发送http请求，用到了BeautifulSoup来解析html文件。

主要在写get_urls_dict这个递归函数时，最开始思路不太清晰，后来只要把递归退出条件、两种条件下不同的递归方式弄清楚，代码写起来就容易了。

好吧，代码在https://github.com/smilejay/python/blob/master/py2015/download_repos.py，如下（写得不太好，欢迎拍砖）：

import requests
from bs4 import BeautifulSoup
import os


class repos(object):

    """Download a directory tree from a Linux mirror site.

    ``get_urls_dict`` crawls the HTML index pages below ``base_url`` and
    records what it finds in ``urls_dict``; ``download`` then fetches every
    recorded file into the matching location below ``base_dir``.

    ``urls_dict`` maps a directory path (starting and ending with ``/``,
    relative to ``base_url``) to a dict with three keys:

    * ``'parent'``   -- path of the parent directory, ``None`` for the root
    * ``'sub_dirs'`` -- hrefs of the sub-directories listed on the page
    * ``'files'``    -- hrefs of the files listed on the page
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh) Gecko/20100101 Firefox/42.0'}
    # Number of bytes written per chunk while streaming a file to disk.
    chunk_size = 64 * 1024

    def __init__(self, base_url, base_dir):
        """Remember the mirror URL and the local target directory.

        Args:
            base_url: URL of the remote directory to mirror, without a
                trailing slash (paths appended to it start with ``/``).
            base_dir: local directory that receives the downloaded tree.
        """
        super(repos, self).__init__()
        self.base_url = base_url
        self.base_dir = base_dir
        # Kept per instance: a class-level dict would be shared by every
        # ``repos`` object and mix the indexes of different mirrors.
        self.urls_dict = {}

    @staticmethod
    def _classify_links(hrefs):
        """Split raw ``href`` values into ``(sub_dirs, files)`` lists.

        Only plain relative entries are kept.  Skipped are: missing or empty
        hrefs, query/fragment links (e.g. the ``?C=N;O=D`` sort links of
        some index pages), absolute paths and URLs, and anything containing
        a ``.`` or ``..`` segment, so neither the crawl nor the local file
        paths can leave the mirrored tree.  Duplicates are dropped while the
        listing order is preserved.
        """
        sub_dirs = []
        files = []
        for href in hrefs:
            if not href:
                continue
            if href.startswith(('?', '#', '/')) or '://' in href:
                continue
            segments = href.split('/')
            if '.' in segments or '..' in segments:
                continue
            bucket = sub_dirs if href.endswith('/') else files
            if href not in bucket:
                bucket.append(href)
        return sub_dirs, files

    @staticmethod
    def _ensure_dir(directory):
        """Create ``directory`` (and its parents) unless it already exists."""
        if os.path.isdir(directory):
            return
        try:
            os.makedirs(directory)
        except OSError:
            # Another process may have created it in the meantime; only a
            # directory that is still missing is a real error.
            if not os.path.isdir(directory):
                raise

    def download(self):
        """Download every file recorded in ``urls_dict`` below ``base_dir``.

        Each URL and its local destination are printed as they are handled.
        Files whose request does not succeed are skipped.
        """
        for path in self.urls_dict:
            for name in self.urls_dict[path]['files']:
                url = self.base_url + path + name
                print(url)
                # Stream the body so large files (images, ISOs) are never
                # held in memory as a whole.
                response = requests.get(
                    url, headers=self.headers, stream=True)
                try:
                    if not response.ok:
                        continue
                    file_location = self.base_dir + path + name
                    print(file_location)
                    self._ensure_dir(os.path.dirname(file_location))
                    with open(file_location, "wb") as the_file:
                        for chunk in response.iter_content(self.chunk_size):
                            the_file.write(chunk)
                finally:
                    response.close()

    def get_urls_dict(self, path='/', parent=None):
        """Recursively index ``path`` and everything below it.

        Fills ``urls_dict`` in place and returns ``None``.  A path that is
        already present in ``urls_dict`` is not fetched again.  A directory
        whose index page cannot be fetched is still recorded, with empty
        ``'sub_dirs'`` and ``'files'`` lists.

        Args:
            path: directory to index, relative to ``base_url``; must start
                and end with ``/``.
            parent: path of the parent directory, ``None`` for the root.
        """
        if path in self.urls_dict:
            return
        entry = {'parent': parent, 'sub_dirs': [], 'files': []}
        self.urls_dict[path] = entry
        response = requests.get(self.base_url + path, headers=self.headers)
        if not response.ok:
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        hrefs = [link.get('href') for link in soup.find_all('a')]
        entry['sub_dirs'], entry['files'] = self._classify_links(hrefs)
        # Descend into each child in turn; the call stack is only as deep
        # as the directory nesting, not the total number of directories.
        for sub_dir in entry['sub_dirs']:
            self.get_urls_dict(path=path + sub_dir, parent=path)


if __name__ == '__main__':
    # Mirror to copy from; an alternative that serves the same tree is
    # 'http://mirrors.aliyun.com/centos/6.7/os/x86_64'.
    mirror_url = 'http://mirrors.163.com/centos/6.7/os/x86_64'
    target_dir = '/tmp/centos6u7'
    downloader = repos(mirror_url, target_dir)
    # First index the remote tree, then fetch every indexed file.
    # (Inspect downloader.urls_dict between the two steps when debugging.)
    downloader.get_urls_dict()
    downloader.download()

import requests

from bs4 import BeautifulSoup

import os

class repos(object):

    """Download a directory tree from a Linux mirror site.

    ``get_urls_dict`` crawls the HTML index pages below ``base_url`` and
    records what it finds in ``urls_dict``; ``download`` then fetches every
    recorded file into the matching location below ``base_dir``.

    ``urls_dict`` maps a directory path (starting and ending with ``/``,
    relative to ``base_url``) to a dict with three keys:

    * ``'parent'``   -- path of the parent directory, ``None`` for the root
    * ``'sub_dirs'`` -- hrefs of the sub-directories listed on the page
    * ``'files'``    -- hrefs of the files listed on the page
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh) Gecko/20100101 Firefox/42.0'}
    # Number of bytes written per chunk while streaming a file to disk.
    chunk_size = 64 * 1024

    def __init__(self, base_url, base_dir):
        """Remember the mirror URL and the local target directory.

        Args:
            base_url: URL of the remote directory to mirror, without a
                trailing slash (paths appended to it start with ``/``).
            base_dir: local directory that receives the downloaded tree.
        """
        super(repos, self).__init__()
        self.base_url = base_url
        self.base_dir = base_dir
        # Kept per instance: a class-level dict would be shared by every
        # ``repos`` object and mix the indexes of different mirrors.
        self.urls_dict = {}

    @staticmethod
    def _classify_links(hrefs):
        """Split raw ``href`` values into ``(sub_dirs, files)`` lists.

        Only plain relative entries are kept.  Skipped are: missing or empty
        hrefs, query/fragment links (e.g. the ``?C=N;O=D`` sort links of
        some index pages), absolute paths and URLs, and anything containing
        a ``.`` or ``..`` segment, so neither the crawl nor the local file
        paths can leave the mirrored tree.  Duplicates are dropped while the
        listing order is preserved.
        """
        sub_dirs = []
        files = []
        for href in hrefs:
            if not href:
                continue
            if href.startswith(('?', '#', '/')) or '://' in href:
                continue
            segments = href.split('/')
            if '.' in segments or '..' in segments:
                continue
            bucket = sub_dirs if href.endswith('/') else files
            if href not in bucket:
                bucket.append(href)
        return sub_dirs, files

    @staticmethod
    def _ensure_dir(directory):
        """Create ``directory`` (and its parents) unless it already exists."""
        if os.path.isdir(directory):
            return
        try:
            os.makedirs(directory)
        except OSError:
            # Another process may have created it in the meantime; only a
            # directory that is still missing is a real error.
            if not os.path.isdir(directory):
                raise

    def download(self):
        """Download every file recorded in ``urls_dict`` below ``base_dir``.

        Each URL and its local destination are printed as they are handled.
        Files whose request does not succeed are skipped.
        """
        for path in self.urls_dict:
            for name in self.urls_dict[path]['files']:
                url = self.base_url + path + name
                print(url)
                # Stream the body so large files (images, ISOs) are never
                # held in memory as a whole.
                response = requests.get(
                    url, headers=self.headers, stream=True)
                try:
                    if not response.ok:
                        continue
                    file_location = self.base_dir + path + name
                    print(file_location)
                    self._ensure_dir(os.path.dirname(file_location))
                    with open(file_location, "wb") as the_file:
                        for chunk in response.iter_content(self.chunk_size):
                            the_file.write(chunk)
                finally:
                    response.close()

    def get_urls_dict(self, path='/', parent=None):
        """Recursively index ``path`` and everything below it.

        Fills ``urls_dict`` in place and returns ``None``.  A path that is
        already present in ``urls_dict`` is not fetched again.  A directory
        whose index page cannot be fetched is still recorded, with empty
        ``'sub_dirs'`` and ``'files'`` lists.

        Args:
            path: directory to index, relative to ``base_url``; must start
                and end with ``/``.
            parent: path of the parent directory, ``None`` for the root.
        """
        if path in self.urls_dict:
            return
        entry = {'parent': parent, 'sub_dirs': [], 'files': []}
        self.urls_dict[path] = entry
        response = requests.get(self.base_url + path, headers=self.headers)
        if not response.ok:
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        hrefs = [link.get('href') for link in soup.find_all('a')]
        entry['sub_dirs'], entry['files'] = self._classify_links(hrefs)
        # Descend into each child in turn; the call stack is only as deep
        # as the directory nesting, not the total number of directories.
        for sub_dir in entry['sub_dirs']:
            self.get_urls_dict(path=path + sub_dir, parent=path)

if __name__ == '__main__':
    # Mirror to copy from; an alternative that serves the same tree is
    # 'http://mirrors.aliyun.com/centos/6.7/os/x86_64'.
    mirror_url = 'http://mirrors.163.com/centos/6.7/os/x86_64'
    target_dir = '/tmp/centos6u7'
    downloader = repos(mirror_url, target_dir)
    # First index the remote tree, then fetch every indexed file.
    # (Inspect downloader.urls_dict between the two steps when debugging.)
    downloader.get_urls_dict()
    downloader.download()

一	二	三	四	五	六	日
« 9月				12月 »
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28	29
30

Related posts:

master

发表评论 取消回复

发表评论取消回复