huggingface-go : 加速下载huggingface的模型和数据集

无需消耗机场的流量。0成本高速下载huggingface上动辄数十GB的数据集和模型。

使用方法

wget https://mirror.ghproxy.com/https://github.com/xieincz/huggingface-go/releases/latest/download/huggingface_go_linux_amd64
mv huggingface_go_linux_amd64 huggingface_go
chmod +x huggingface_go

#使用方法: huggingface_go -u huggingface的模型/数据集链接 -f (可选)要保存的位置,可以是文件夹名字也可以是路径 -p (可选)代理链接(如果填写,必须以 / 结尾)
#例如:想要下载 https://huggingface.co/NousResearch/Llama-2-13b-hf 这个模型
#使用方法一:通过镜像站 https://hf-mirror.com 下载
./huggingface_go -u https://hf-mirror.com/NousResearch/Llama-2-13b-hf/tree/main -f /path/to/your/folder

#使用方法二:通过代理链接从huggingface下载
./huggingface_go -u https://huggingface.co/NousResearch/Llama-2-13b-hf/tree/main -f /path/to/your/folder -p https://worker-share-proxy-3y2sz7.xieincz.eu.org/

更多操作系统/CPU架构的版本可以到我的GitHub仓库下载:https://github.com/xieincz/huggingface-go
如果对你有帮助的话,不妨点个star😊

创建自己的代理链接

博主提供的代理链接用的是cloudflare的免费版套餐,每日只有10万次请求的额度,如果有很多人用就很容易超过这个额度。所以创建一个自己的代理链接就很有必要了。根据我自己的使用经验,下载一个数据集大约要消耗80次请求。
要开始下面的步骤,你需要有:

  1. 一个cloudflare账号(打开 https://dash.cloudflare.com/sign-up 来免费注册一个)

  2. 一个绑定到cloudflare账号的域名( https://nic.eu.org/ 可以注册免费的域名,然后将其添加到cloudflare账号中即可)

创建worker

  1. 打开 https://dash.cloudflare.com/?to=/:account/workers-and-pages/create/workers/new

  2. 名称可以自己填个喜欢的。比如 worker-share-proxy-3y2sz7 ,然后点击右下角的“部署”。

  3. 然后打开刚才创建的worker修改代码。
    可以点这个链接直达修改页面(如果你第二步的名称和我的不同,记得修改) https://dash.cloudflare.com/?to=/:account/workers/services/edit/worker-share-proxy-3y2sz7/production

  4. 将原有的代码全部删除,然后将下面这段代码复制粘贴到第三步打开的网页中。然后点击右上角的“保存并部署”。

'use strict'
// Base URL that fetchHandler falls back to when a request matches none of the proxy rules.
const ASSET_URL = 'https://hunshcn.github.io/gh-proxy/'
// Path prefix that sits between this worker's origin and the target URL.
const PREFIX = '/'
const Config = {
    // When truthy, github.com "/blob/" links are redirected to cdn.jsdelivr.net instead of being proxied.
    jsdelivr: 0
}
const whiteList = [] // Whitelist: only targets containing one of these substrings are proxied, e.g. ['/username/']. Empty means no restriction.
// Response init used to answer CORS preflight (OPTIONS) requests.
const PREFLIGHT_INIT = {
    status: 204,
    headers: new Headers({
        'access-control-allow-origin': '*',
        'access-control-allow-methods': 'GET,POST,PUT,PATCH,TRACE,DELETE,HEAD,OPTIONS',
        'access-control-max-age': '1728000',
    }),
}
// URL patterns recognised as GitHub targets. The scheme is optional and matching is case-insensitive.
// exp1: github.com/<x>/<y>/releases/... or .../archive/...
const exp1 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/(?:releases|archive)\/.*$/i
// exp2: github.com/<x>/<y>/blob/... or .../raw/...
const exp2 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/(?:blob|raw)\/.*$/i
// exp3: github.com/<x>/<y>/info... or .../git-...
const exp3 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/(?:info|git-).*$/i
// exp4: raw.githubusercontent.com or raw.github.com with at least four path segments
const exp4 = /^(?:https?:\/\/)?raw\.(?:githubusercontent|github)\.com\/.+?\/.+?\/.+?\/.+$/i
// exp5: gist.githubusercontent.com or gist.github.com with at least three path segments
const exp5 = /^(?:https?:\/\/)?gist\.(?:githubusercontent|github)\.com\/.+?\/.+?\/.+$/i
// exp6: github.com/<x>/<y>/tags...
const exp6 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/tags.*$/i
/**
 * Build a Response that always carries a permissive CORS header.
 *
 * @param {*} body - Response body, passed straight to the Response constructor.
 * @param {number} [status=200] - HTTP status code.
 * @param {Object|Headers} [headers={}] - Extra response headers. Never modified.
 * @returns {Response} Response with `access-control-allow-origin: *` set.
 */
function makeRes(body, status = 200, headers = {}) {
    // Copy into a fresh Headers object: the caller's object stays untouched,
    // and a Headers instance is handled the same way as a plain object.
    const merged = new Headers(headers)
    merged.set('access-control-allow-origin', '*')
    return new Response(body, {status, headers: merged})
}
/**
 * Parse a string into a URL without throwing.
 *
 * @param {string} urlStr - Candidate absolute URL.
 * @returns {URL|null} The parsed URL, or null when the string cannot be parsed.
 */
function newUrl(urlStr) {
    let parsed = null
    try {
        parsed = new URL(urlStr)
    } catch (parseError) {
        // Unparseable input is reported to the caller as null.
    }
    return parsed
}
// Worker entry point: answer every fetch event through fetchHandler.
addEventListener('fetch', (event) => {
    // fetchHandler is async, so any failure surfaces as a rejection and is
    // turned into a 502 response that carries the error stack.
    const onFailure = (err) => makeRes('cfworker error:\n' + err.stack, 502)
    event.respondWith(fetchHandler(event).catch(onFailure))
})
/**
 * Tell whether a URL string is one of the recognised GitHub targets.
 *
 * @param {string} u - URL string to test.
 * @returns {boolean} True when any of exp1..exp6 matches at position 0.
 */
function checkUrl(u) {
    const patterns = [exp1, exp2, exp3, exp4, exp5, exp6]
    return patterns.some((pattern) => u.search(pattern) === 0)
}
// Extra targets to proxy in addition to the GitHub patterns.
// The single pattern below matches any string that starts with http:// or https://
// (case-insensitive), so every absolute http(s) URL is accepted.
const customProxyDomainArray=[
    /^(?:https?:\/\/).*$/i
]
/**
 * Tell whether a URL string matches one of the custom proxy patterns.
 *
 * @param {string} u - URL string to test.
 * @returns {boolean} True when any pattern in customProxyDomainArray matches at position 0.
 */
function checkCustomUrl(u) {
    return customProxyDomainArray.some((pattern) => u.search(pattern) === 0)
}
/**
 * Route one incoming request.
 *
 * - `?q=<target>` is redirected (301) to the canonical `<origin><PREFIX><target>` form.
 * - Targets matching exp1/exp3/exp4/exp5/exp6 are proxied through httpHandler.
 * - github.com blob/raw targets (exp2) are redirected to jsDelivr when
 *   Config.jsdelivr is truthy; otherwise "/blob/" becomes "/raw/" and they are proxied.
 * - Any other target accepted by checkCustomUrl is proxied.
 * - Everything else is fetched from ASSET_URL.
 *
 * @param {FetchEvent} e - The fetch event; only `e.request` is read.
 * @returns {Promise<Response>}
 */
async function fetchHandler(e) {
    const req = e.request
    const urlObj = new URL(req.url)

    const queryTarget = urlObj.searchParams.get('q')
    if (queryTarget) {
        return Response.redirect('https://' + urlObj.host + PREFIX + queryTarget, 301)
    }

    // Everything after origin + PREFIX is the target. A leading "http(s):" followed
    // by one or more slashes is rewritten to exactly "https://".
    let path = urlObj.href.slice(urlObj.origin.length + PREFIX.length).replace(/^https?:\/+/, 'https://')
    const matches = (pattern) => path.search(pattern) === 0

    if ([exp1, exp5, exp6, exp3, exp4].some(matches)) {
        return httpHandler(req, path)
    }

    if (matches(exp2)) {
        if (Config.jsdelivr) {
            const cdnUrl = path.replace('/blob/', '@').replace(/^(?:https?:\/\/)?github\.com/, 'https://cdn.jsdelivr.net/gh')
            return Response.redirect(cdnUrl, 302)
        }
        path = path.replace('/blob/', '/raw/')
        return httpHandler(req, path)
    }

    // The former separate exp4 -> jsDelivr branch was removed: exp4 is already
    // handled by the first check above, so that branch could never run.
    if (checkCustomUrl(path)) {
        return httpHandler(req, path)
    }

    return fetch(ASSET_URL + path)
}
/**
 * Validate a proxy target and forward the request to it.
 *
 * @param {Request} req - Incoming request; its method, headers and body are forwarded.
 * @param {string} pathname - Target URL, with or without an http(s) scheme.
 * @returns {Response|Promise<Response>} A preflight answer, a 403 when the
 *   whitelist rejects the target, a 400 when the target cannot be parsed,
 *   otherwise the result of proxy().
 */
function httpHandler(req, pathname) {
    const reqHdrRaw = req.headers
    // Answer CORS preflight requests directly.
    if (req.method === 'OPTIONS' &&
        reqHdrRaw.has('access-control-request-headers')
    ) {
        return new Response(null, PREFLIGHT_INIT)
    }

    // An empty whitelist allows everything; otherwise one entry must occur in the target.
    const allowed = whiteList.length === 0 ||
        whiteList.some((fragment) => pathname.includes(fragment))
    if (!allowed) {
        return new Response("blocked", {status: 403})
    }

    // The routing patterns accept scheme-less targets (github.com/..., raw.githubusercontent.com/...,
    // gist.github.com/..., in any letter case), so add a scheme whenever none is present.
    const urlStr = /^https?:\/\//i.test(pathname) ? pathname : 'https://' + pathname
    const urlObj = newUrl(urlStr)
    if (urlObj === null) {
        // Report an unparseable target instead of crashing on a null URL in proxy().
        return new Response('invalid url', {status: 400})
    }

    const reqInit = {
        method: req.method,
        headers: new Headers(reqHdrRaw),
        // Redirects are handled by proxy() so the Location header can be rewritten.
        redirect: 'manual',
        body: req.body
    }
    return proxy(urlObj, reqInit)
}
/**
 * Fetch the target and relay its response with CORS-friendly headers.
 *
 * When the response carries a Location header:
 * - if it is a recognised GitHub URL (checkUrl), the header is rewritten so
 *   the client comes back through this worker;
 * - otherwise the redirect is followed here and the final response is relayed.
 *
 * @param {URL} urlObj - Target URL.
 * @param {Object} reqInit - Options passed to fetch(); not modified.
 * @returns {Promise<Response>}
 */
async function proxy(urlObj, reqInit) {
    const res = await fetch(urlObj.href, reqInit)
    const resHdrNew = new Headers(res.headers)
    const status = res.status

    if (resHdrNew.has('location')) {
        const location = resHdrNew.get('location')
        if (checkUrl(location)) {
            resHdrNew.set('location', PREFIX + location)
        } else {
            // Location may be relative; resolve it against the URL just requested.
            const nextUrl = new URL(location, urlObj.href)
            // Pass a copy so the caller's options object is left untouched.
            return proxy(nextUrl, {...reqInit, redirect: 'follow'})
        }
    }

    resHdrNew.set('access-control-expose-headers', '*')
    resHdrNew.set('access-control-allow-origin', '*')
    resHdrNew.delete('content-security-policy')
    resHdrNew.delete('content-security-policy-report-only')
    resHdrNew.delete('clear-site-data')

    return new Response(res.body, {
        status,
        headers: resHdrNew,
    })
}
  5. 打开 https://dash.cloudflare.com/?to=/:account/workers/services/view/worker-share-proxy-3y2sz7/production/settings/triggers
    (注意更改链接中的 worker-share-proxy-3y2sz7 ),点击“添加自定义域”,输入 worker-share-proxy-3y2sz7.xieincz.eu.org (这个 xieincz.eu.org 要换成你绑定到cloudflare的域名),然后保存。

  6. 稍等几分钟,打开第五步输入的网址。如果看到一个输入框和一个按钮,恭喜你,已经成功完成了这个教程。
    你可以将此时浏览器的地址栏中的网址(形如 https://worker-share-proxy-3y2sz7.xieincz.eu.org/ )用作huggingface-go的代理链接。