无需消耗机场的流量。0成本高速下载huggingface上动辄数十GB的数据集和模型。
使用方法
wget https://mirror.ghproxy.com/https://github.com/xieincz/huggingface-go/releases/latest/download/huggingface_go_linux_amd64
mv huggingface_go_linux_amd64 huggingface_go
chmod +x huggingface_go
# Usage: huggingface_go -u <Hugging Face model/dataset URL> -f (optional) where to save; either a folder name or a path -p (optional) proxy URL (if given, it must end with /)
# Example: to download the model https://huggingface.co/NousResearch/Llama-2-13b-hf
# Option 1: download through the mirror site https://hf-mirror.com
./huggingface_go -u https://hf-mirror.com/NousResearch/Llama-2-13b-hf/tree/main -f /path/to/your/folder
# Option 2: download from huggingface through a proxy URL
./huggingface_go -u https://huggingface.co/NousResearch/Llama-2-13b-hf/tree/main -f /path/to/your/folder -p https://worker-share-proxy-3y2sz7.xieincz.eu.org/
更多操作系统/CPU架构的版本可以到我的GitHub仓库下载:https://github.com/xieincz/huggingface-go
如果对你有帮助的话,不妨点个star😊
创建自己的代理链接
博主提供的代理链接用的是cloudflare的免费版套餐,每日只有10万次请求的额度,如果有很多人用就很容易超过这个额度。所以创建一个自己的代理链接就很有必要了。根据我自己的使用经验,下载一个数据集大约要消耗80次请求。
要开始下面的步骤,你需要有:
-
一个cloudflare账号(打开 https://dash.cloudflare.com/sign-up 来免费注册一个)
-
一个绑定到cloudflare账号的域名( https://nic.eu.org/ 可以注册免费的域名,然后将其添加到cloudflare账号中即可)
创建worker
-
打开 https://dash.cloudflare.com/?to=/:account/workers-and-pages/create/workers/new
-
名称可以自己填个喜欢的,比如 `worker-share-proxy-3y2sz7`,然后点击右下角的“部署”。
-
然后打开刚才创建的worker修改代码。
可以点这个链接直达修改页面(如果你在第二步填写的名称和我的不同,记得把链接中的 `worker-share-proxy-3y2sz7` 换成你自己的名称): https://dash.cloudflare.com/?to=/:account/workers/services/edit/worker-share-proxy-3y2sz7/production
-
将原有的代码全部删除,然后将下面这段代码复制粘贴到第三步打开的网页中。然后点击右上角的“保存并部署”。
'use strict'

// Base URL that is fetched for any request which matches no proxy rule.
const ASSET_URL = 'https://hunshcn.github.io/gh-proxy/'

// Path prefix in front of the target URL in incoming requests; also used
// when rewriting redirect locations back through this worker.
const PREFIX = '/'

// Feature switches. When `jsdelivr` is truthy, github.com "blob" links are
// redirected to cdn.jsdelivr.net instead of being proxied.
const Config = {jsdelivr: 0}

// Allow-list: when non-empty, a target is only proxied if it contains at
// least one of these substrings, e.g. ['/username/'].
const whiteList = []

// Response init used to answer CORS preflight (OPTIONS) requests.
const PREFLIGHT_INIT = {
    status: 204,
    headers: new Headers([
        ['access-control-allow-origin', '*'],
        ['access-control-allow-methods', 'GET,POST,PUT,PATCH,TRACE,DELETE,HEAD,OPTIONS'],
        ['access-control-max-age', '1728000'],
    ]),
}
// URL patterns for the GitHub targets this worker recognises. Every pattern
// is anchored at the start, is case-insensitive and accepts an optional
// http:// or https:// scheme. The two segments after github.com are
// presumably <owner>/<repo>.
// exp1: github.com/<seg>/<seg>/releases/... or .../archive/...
const exp1 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/(?:releases|archive)\/.*$/i
// exp2: github.com/<seg>/<seg>/blob/... or .../raw/...
const exp2 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/(?:blob|raw)\/.*$/i
// exp3: github.com/<seg>/<seg>/info... or .../git-...
const exp3 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/(?:info|git-).*$/i
// exp4: raw.githubusercontent.com or raw.github.com followed by at least four non-empty path parts
const exp4 = /^(?:https?:\/\/)?raw\.(?:githubusercontent|github)\.com\/.+?\/.+?\/.+?\/.+$/i
// exp5: gist.githubusercontent.com or gist.github.com followed by at least three non-empty path parts
const exp5 = /^(?:https?:\/\/)?gist\.(?:githubusercontent|github)\.com\/.+?\/.+?\/.+$/i
// exp6: github.com/<seg>/<seg>/tags...
const exp6 = /^(?:https?:\/\/)?github\.com\/.+?\/.+?\/tags.*$/i
/**
 * Builds a Response that any origin is allowed to read.
 *
 * @param {*} body - Response body, passed straight to the Response constructor.
 * @param {number} [status=200] - HTTP status code.
 * @param {Object|Headers} [headers={}] - Extra response headers. The value is
 *   copied, never modified.
 * @returns {Response} Response carrying `access-control-allow-origin: *`.
 */
function makeRes(body, status = 200, headers = {}) {
    // Copy into a real Headers object: this leaves the caller's object
    // untouched and also works when the caller passes a Headers instance
    // (assigning a property on a Headers instance does not add a header).
    const merged = new Headers(headers)
    merged.set('access-control-allow-origin', '*')
    return new Response(body, {status, headers: merged})
}
/**
 * Parses a string as an absolute URL without throwing.
 *
 * @param {string} urlStr - Text to parse.
 * @returns {URL|null} The parsed URL, or null when the text cannot be parsed.
 */
function newUrl(urlStr) {
    let parsed = null
    try {
        parsed = new URL(urlStr)
    } catch (ignored) {
        // Unparseable input is reported as null rather than as an exception.
    }
    return parsed
}
// Worker entry point: every fetch event is answered by fetchHandler. If the
// handler rejects, the client receives a 502 whose body contains the stack.
addEventListener('fetch', event => {
    const onFailure = err => makeRes('cfworker error:\n' + err.stack, 502)
    event.respondWith(fetchHandler(event).catch(onFailure))
})
/**
 * Tells whether a URL string matches one of the GitHub patterns exp1-exp6.
 *
 * @param {string} u - URL text to test.
 * @returns {boolean} true when some pattern matches starting at index 0.
 */
function checkUrl(u) {
    const githubPatterns = [exp1, exp2, exp3, exp4, exp5, exp6]
    return githubPatterns.some(pattern => u.search(pattern) === 0)
}
// Additional targets that may be proxied besides the GitHub patterns.
// The single entry accepts any string that starts with http:// or https://.
const customProxyDomainArray = [
    /^(?:https?:\/\/).*$/i,
]

/**
 * Tells whether a URL string matches one of the custom proxy patterns.
 *
 * @param {string} u - URL text to test.
 * @returns {boolean} true when some pattern matches starting at index 0.
 */
function checkCustomUrl(u) {
    return customProxyDomainArray.some(pattern => u.search(pattern) === 0)
}
/**
 * Routes one incoming fetch event.
 *
 * - `?q=<target>` is redirected (301) to `PREFIX + target` on the same host.
 * - Targets matching exp1, exp3, exp4, exp5 or exp6 are proxied as-is.
 * - Targets matching exp2 (blob/raw) are redirected to cdn.jsdelivr.net when
 *   Config.jsdelivr is truthy; otherwise `/blob/` is rewritten to `/raw/`
 *   and the result is proxied.
 * - Any other target accepted by checkCustomUrl is proxied.
 * - Everything else is fetched from ASSET_URL.
 *
 * @param {FetchEvent} e - The fetch event; only `e.request` is read.
 * @returns {Promise<Response>} The response to send to the client.
 */
async function fetchHandler(e) {
    const req = e.request
    const urlObj = new URL(req.url)

    const query = urlObj.searchParams.get('q')
    if (query) {
        return Response.redirect('https://' + urlObj.host + PREFIX + query, 301)
    }

    // Everything after "<origin><PREFIX>" is the target. The scheme is forced
    // to https:// and any run of slashes after it (e.g. "https:/host") is
    // normalised to exactly two.
    const path = urlObj.href
        .slice(urlObj.origin.length + PREFIX.length)
        .replace(/^https?:\/+/, 'https://')

    // raw.* URLs (exp4) are always proxied directly, so no separate
    // jsDelivr branch for them exists below.
    const proxiedDirectly = [exp1, exp5, exp6, exp3, exp4]
    if (proxiedDirectly.some(pattern => path.search(pattern) === 0)) {
        return httpHandler(req, path)
    }

    if (path.search(exp2) === 0) {
        if (Config.jsdelivr) {
            const cdnUrl = path
                .replace('/blob/', '@')
                .replace(/^(?:https?:\/\/)?github\.com/, 'https://cdn.jsdelivr.net/gh')
            return Response.redirect(cdnUrl, 302)
        }
        return httpHandler(req, path.replace('/blob/', '/raw/'))
    }

    if (checkCustomUrl(path)) {
        return httpHandler(req, path)
    }

    return fetch(ASSET_URL + path)
}
/**
 * Validates a proxy target and forwards the request to it.
 *
 * @param {Request} req - Incoming request; its method, headers and body are
 *   forwarded.
 * @param {string} pathname - Target URL text, with or without a scheme.
 * @returns {Response|Promise<Response>} 204 for a CORS preflight, 403 when
 *   the allow-list rejects the target, 400 when the target cannot be parsed
 *   as a URL, otherwise the result of proxy().
 */
function httpHandler(req, pathname) {
    const reqHdrRaw = req.headers

    // Answer CORS preflight locally instead of forwarding it upstream.
    if (req.method === 'OPTIONS' && reqHdrRaw.has('access-control-request-headers')) {
        return new Response(null, PREFLIGHT_INIT)
    }

    // An empty allow-list permits everything.
    const allowed = whiteList.length === 0 ||
        whiteList.some(fragment => pathname.includes(fragment))
    if (!allowed) {
        return new Response("blocked", {status: 403})
    }

    // The GitHub patterns accept scheme-less targets (github.com/...,
    // raw.githubusercontent.com/..., gist.github.com/...), so add the scheme
    // whenever it is missing, not only for strings starting with "github".
    const urlStr = /^https?:\/\//i.test(pathname) ? pathname : 'https://' + pathname
    const urlObj = newUrl(urlStr)
    if (urlObj === null) {
        // Reject here; proxy() would otherwise fail on a null URL.
        return new Response('invalid url', {status: 400})
    }

    return proxy(urlObj, {
        method: req.method,
        headers: new Headers(reqHdrRaw),
        redirect: 'manual',
        body: req.body,
    })
}
/**
 * Fetches the upstream URL and relays the response with permissive CORS
 * headers.
 *
 * When the upstream response carries a Location header:
 * - if the resolved location matches checkUrl, the header is rewritten to
 *   `PREFIX + location` and the response is relayed;
 * - otherwise the location is fetched by this function, with
 *   `reqInit.redirect` switched to 'follow'.
 *
 * The relayed response has `content-security-policy`,
 * `content-security-policy-report-only` and `clear-site-data` removed.
 *
 * @param {URL} urlObj - Upstream URL to fetch.
 * @param {Object} reqInit - Init object passed to fetch(); its `redirect`
 *   field is modified when a redirect is followed.
 * @returns {Promise<Response>} The relayed response.
 */
async function proxy(urlObj, reqInit) {
    const res = await fetch(urlObj.href, reqInit)
    const resHdrNew = new Headers(res.headers)

    if (resHdrNew.has('location')) {
        // Location may be relative (e.g. "/path/file"); resolve it against
        // the URL that was just requested so it can be checked and followed.
        const target = new URL(resHdrNew.get('location'), urlObj.href)
        if (!checkUrl(target.href)) {
            reqInit.redirect = 'follow'
            return proxy(target, reqInit)
        }
        resHdrNew.set('location', PREFIX + target.href)
    }

    resHdrNew.set('access-control-expose-headers', '*')
    resHdrNew.set('access-control-allow-origin', '*')
    resHdrNew.delete('content-security-policy')
    resHdrNew.delete('content-security-policy-report-only')
    resHdrNew.delete('clear-site-data')

    return new Response(res.body, {
        status: res.status,
        headers: resHdrNew,
    })
}
-
打开 https://dash.cloudflare.com/?to=/:account/workers/services/view/worker-share-proxy-3y2sz7/production/settings/triggers
(注意把链接中的 `worker-share-proxy-3y2sz7` 换成你自己的worker名称),点击“添加自定义域”,输入 `worker-share-proxy-3y2sz7.xieincz.eu.org`(其中的 `xieincz.eu.org` 要换成你绑定到cloudflare的域名),然后保存。
-
稍等几分钟,打开第五步输入的网址。如果看到一个输入框和一个按钮,恭喜你,已经成功完成了这个教程。
你可以将此时浏览器地址栏中的网址(形如 `https://worker-share-proxy-3y2sz7.xieincz.eu.org/`,注意必须以 / 结尾)用作huggingface-go的代理链接。