更换文档检测模型

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions
--- a/paddle_detection/deploy/pipeline/download.py
+++ b/paddle_detection/deploy/pipeline/download.py
@@ -0,0 +1,340 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os, sys
+import os.path as osp
+import hashlib
+import requests
+import shutil
+import tqdm
+import time
+import tarfile
+import zipfile
+from paddle.utils.download import _get_unique_endpoints
+
+PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/'
+
+DOWNLOAD_RETRY_LIMIT = 3
+
+WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/infer_weights")
+
+MODEL_URL_MD5_DICT = {
+    'https://bj.bcebos.com/v1/paddledet/models/pipeline/ch_PP-OCRv3_det_infer.tar.gz':
+    '1b8eae0f098635699bd4e8bccf3067a7',
+    'https://bj.bcebos.com/v1/paddledet/models/pipeline/ch_PP-OCRv3_rec_infer.tar.gz':
+    '64fa0e0701efd93c7db52a9b685b3de6',
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_l_36e_ppvehicle.zip":
+    "3859d1a26e0c498285c2374b1a347013",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_s_36e_ppvehicle.zip":
+    "4ed58b546be2a76d8ccbb138f64874ac",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip":
+    "a20d5f6ca087bff0e9f2b18df45a36f2",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/PPLCNet_x1_0_person_attribute_945_infer.zip":
+    "1dfb161bf12bbc1365b2ed6866674483",
+    "https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip":
+    "5d4609142501258608bf0a1445eedaba",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/STGCN.zip":
+    "cf1c3c4bae90b975accb954d13129ea4",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/ppyoloe_crn_s_80e_smoking_visdrone.zip":
+    "4cd12ae55be8f0eb2b90c08ac3b48218",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/PPHGNet_tiny_calling_halfbody.zip":
+    "cf86b87ace97540dace6ef08e62b584a",
+    "https://bj.bcebos.com/v1/paddledet/models/pipeline/reid_model.zip":
+    "fdc4dac38393b8e2b5921c1e1fdd5315"
+}
+
+
+def is_url(path):
+    """
+    Whether path is URL.
+    Args:
+        path (string): URL string or not.
+    """
+    return path.startswith('http://') \
+            or path.startswith('https://') \
+            or path.startswith('ppdet://')
+
+
+def parse_url(url):
+    url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX)
+    return url
+
+
+def map_path(url, root_dir, path_depth=1):
+    # parse path after download to decompress under root_dir
+    assert path_depth > 0, "path_depth should be a positive integer"
+    dirname = url
+    for _ in range(path_depth):
+        dirname = osp.dirname(dirname)
+    fpath = osp.relpath(url, dirname)
+
+    zip_formats = ['.zip', '.tar', '.gz']
+    for zip_format in zip_formats:
+        fpath = fpath.replace(zip_format, '')
+    return osp.join(root_dir, fpath)
+
+
+def _md5check(fullname, md5sum=None):
+    if md5sum is None:
+        return True
+
+    md5 = hashlib.md5()
+    with open(fullname, 'rb') as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5.update(chunk)
+    calc_md5sum = md5.hexdigest()
+
+    if calc_md5sum != md5sum:
+        return False
+    return True
+
+
+def _check_exist_file_md5(filename, md5sum, url):
+    return _md5check(filename, md5sum)
+
+
+def _download(url, path, md5sum=None):
+    """
+    Download from url, save to path.
+    url (str): download url
+    path (str): download to given path
+    """
+    if not osp.exists(path):
+        os.makedirs(path)
+
+    fname = osp.split(url)[-1]
+    fullname = osp.join(path, fname)
+    retry_cnt = 0
+    while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum,
+                                                              url)):
+        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
+            retry_cnt += 1
+        else:
+            raise RuntimeError("Download from {} failed. "
+                               "Retry limit reached".format(url))
+
+        # NOTE: windows path join may incur \, which is invalid in url
+        if sys.platform == "win32":
+            url = url.replace('\\', '/')
+
+        req = requests.get(url, stream=True)
+        if req.status_code != 200:
+            raise RuntimeError("Downloading from {} failed with code "
+                               "{}!".format(url, req.status_code))
+
+        # For protecting download interupted, download to
+        # tmp_fullname firstly, move tmp_fullname to fullname
+        # after download finished
+        tmp_fullname = fullname + "_tmp"
+        total_size = req.headers.get('content-length')
+        with open(tmp_fullname, 'wb') as f:
+            if total_size:
+                for chunk in tqdm.tqdm(
+                        req.iter_content(chunk_size=1024),
+                        total=(int(total_size) + 1023) // 1024,
+                        unit='KB'):
+                    f.write(chunk)
+            else:
+                for chunk in req.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+        shutil.move(tmp_fullname, fullname)
+    return fullname
+
+
+def _download_dist(url, path, md5sum=None):
+    env = os.environ
+    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
+        trainer_id = int(env['PADDLE_TRAINER_ID'])
+        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
+        if num_trainers <= 1:
+            return _download(url, path, md5sum)
+        else:
+            fname = osp.split(url)[-1]
+            fullname = osp.join(path, fname)
+            lock_path = fullname + '.download.lock'
+
+            if not osp.isdir(path):
+                os.makedirs(path)
+
+            if not osp.exists(fullname):
+                from paddle.distributed import ParallelEnv
+                unique_endpoints = _get_unique_endpoints(ParallelEnv()
+                                                         .trainer_endpoints[:])
+                with open(lock_path, 'w'):  # touch    
+                    os.utime(lock_path, None)
+                if ParallelEnv().current_endpoint in unique_endpoints:
+                    _download(url, path, md5sum)
+                    os.remove(lock_path)
+                else:
+                    while os.path.exists(lock_path):
+                        time.sleep(0.5)
+            return fullname
+    else:
+        return _download(url, path, md5sum)
+
+
+def _move_and_merge_tree(src, dst):
+    """
+    Move src directory to dst, if dst is already exists,
+    merge src to dst
+    """
+    if not osp.exists(dst):
+        shutil.move(src, dst)
+    elif osp.isfile(src):
+        shutil.move(src, dst)
+    else:
+        for fp in os.listdir(src):
+            src_fp = osp.join(src, fp)
+            dst_fp = osp.join(dst, fp)
+            if osp.isdir(src_fp):
+                if osp.isdir(dst_fp):
+                    _move_and_merge_tree(src_fp, dst_fp)
+                else:
+                    shutil.move(src_fp, dst_fp)
+            elif osp.isfile(src_fp) and \
+                    not osp.isfile(dst_fp):
+                shutil.move(src_fp, dst_fp)
+
+
+def _decompress(fname):
+    """
+    Decompress for zip and tar file
+    """
+
+    # For protecting decompressing interupted,
+    # decompress to fpath_tmp directory firstly, if decompress
+    # successed, move decompress files to fpath and delete
+    # fpath_tmp and remove download compress file.
+    fpath = osp.split(fname)[0]
+    fpath_tmp = osp.join(fpath, 'tmp')
+    if osp.isdir(fpath_tmp):
+        shutil.rmtree(fpath_tmp)
+        os.makedirs(fpath_tmp)
+
+    if fname.find('tar') >= 0:
+        with tarfile.open(fname) as tf:
+            tf.extractall(path=fpath_tmp)
+    elif fname.find('zip') >= 0:
+        with zipfile.ZipFile(fname) as zf:
+            zf.extractall(path=fpath_tmp)
+    elif fname.find('.txt') >= 0:
+        return
+    else:
+        raise TypeError("Unsupport compress file type {}".format(fname))
+
+    for f in os.listdir(fpath_tmp):
+        src_dir = osp.join(fpath_tmp, f)
+        dst_dir = osp.join(fpath, f)
+        _move_and_merge_tree(src_dir, dst_dir)
+
+    shutil.rmtree(fpath_tmp)
+    os.remove(fname)
+
+
+def _decompress_dist(fname):
+    env = os.environ
+    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
+        trainer_id = int(env['PADDLE_TRAINER_ID'])
+        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
+        if num_trainers <= 1:
+            _decompress(fname)
+        else:
+            lock_path = fname + '.decompress.lock'
+            from paddle.distributed import ParallelEnv
+            unique_endpoints = _get_unique_endpoints(ParallelEnv()
+                                                     .trainer_endpoints[:])
+            # NOTE(dkp): _decompress_dist always performed after
+            # _download_dist, in _download_dist sub-trainers is waiting
+            # for download lock file release with sleeping, if decompress
+            # prograss is very fast and finished with in the sleeping gap
+            # time, e.g in tiny dataset such as coco_ce, spine_coco, main
+            # trainer may finish decompress and release lock file, so we
+            # only craete lock file in main trainer and all sub-trainer
+            # wait 1s for main trainer to create lock file, for 1s is
+            # twice as sleeping gap, this waiting time can keep all
+            # trainer pipeline in order
+            # **change this if you have more elegent methods**
+            if ParallelEnv().current_endpoint in unique_endpoints:
+                with open(lock_path, 'w'):  # touch    
+                    os.utime(lock_path, None)
+                _decompress(fname)
+                os.remove(lock_path)
+            else:
+                time.sleep(1)
+                while os.path.exists(lock_path):
+                    time.sleep(0.5)
+    else:
+        _decompress(fname)
+
+
+def get_path(url, root_dir=WEIGHTS_HOME, md5sum=None, check_exist=True):
+    """ Download from given url to root_dir.
+    if file or directory specified by url is exists under
+    root_dir, return the path directly, otherwise download
+    from url and decompress it, return the path.
+    url (str): download url
+    root_dir (str): root dir for downloading
+    md5sum (str): md5 sum of download package
+    """
+    # parse path after download to decompress under root_dir
+    fullpath = map_path(url, root_dir)
+
+    # For same zip file, decompressed directory name different
+    # from zip file name, rename by following map
+    decompress_name_map = {"ppTSM_fight": "ppTSM", }
+    for k, v in decompress_name_map.items():
+        if fullpath.find(k) >= 0:
+            fullpath = osp.join(osp.split(fullpath)[0], v)
+
+    if osp.exists(fullpath) and check_exist:
+        if not osp.isfile(fullpath) or \
+                _check_exist_file_md5(fullpath, md5sum, url):
+            return fullpath, True
+        else:
+            os.remove(fullpath)
+
+    fullname = _download_dist(url, root_dir, md5sum)
+
+    # new weights format which postfix is 'pdparams' not
+    # need to decompress
+    if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml']:
+        _decompress_dist(fullname)
+
+    return fullpath, False
+
+
+def get_weights_path(url):
+    """Get weights path from WEIGHTS_HOME, if not exists,
+    download it from url.
+    """
+    url = parse_url(url)
+    md5sum = None
+    if url in MODEL_URL_MD5_DICT.keys():
+        md5sum = MODEL_URL_MD5_DICT[url]
+    path, _ = get_path(url, WEIGHTS_HOME, md5sum)
+    return path
+
+
+def auto_download_model(model_path):
+    # auto download
+    if is_url(model_path):
+        weight = get_weights_path(model_path)
+        return weight
+    return None
+
+
+if __name__ == "__main__":
+    model_path = "https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_l_36e_pipeline.zip"
+    auto_download_model(model_path)