Public Datasets

ImageNet

https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar

https://image-net.org/challenges/LSVRC/2012/2012-downloads.php

1
# Unpack the outer ImageNet training archive into train/, then expand every
# per-class tar into its own train/<class>/ directory.
# A class tar is deleted only after it has been extracted successfully, so a
# failed or interrupted extraction never loses data and the command can be re-run.
mkdir -p train && tar -xvf ILSVRC2012_img_train.tar -C train &&
for x in train/*.tar; do
    [ -e "$x" ] || continue   # glob matched nothing: no class archives present
    fn="${x%.tar}"            # train/<class>.tar -> train/<class>
    mkdir -p "$fn" && tar -xvf "$x" -C "$fn" && rm -f -- "$x"
done

整理val到分类文件夹 (sort the validation images into per-class folders using the script below):

https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh

MIMIC-CXR

https://physionet.org/content/mimic-cxr-jpg/2.1.0/

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
# Source: https://github.com/baeseongsu/mimic-cxr-vqa/blob/master/download_images.sh
#
# This script downloads the MIMIC-CXR-VQA dataset images after gathering image paths from JSON files.
# It uses wget for the downloads, python to read the JSON files, and bash's readarray builtin.

# Capture the start time (seconds since the epoch) for the runtime report at the end
start_time=$(date +%s)

# Prompt for PhysioNet credentials.
# -r keeps backslashes in the input literal (without it, read treats them as escapes);
# -s stops the password from being echoed to the terminal.
echo "Enter your PhysioNet credentials"
read -r -p "Username: " USERNAME
read -r -s -p "Password: " PASSWORD
echo

# Base URL for the MIMIC-CXR dataset
# NOTE(review): the dataset page linked in these notes is version 2.1.0 while this
# URL pins 2.0.0 - confirm which release is intended.
MIMIC_CXR_JPG_DIR="https://physionet.org/files/mimic-cxr-jpg/2.0.0"

# wget parameters for downloading files:
#   -r recursive, -N timestamping (skip files that are not newer), -c continue
#   partial downloads, -np never ascend to the parent directory.
# This stays a single string because download() expands it unquoted and relies on
# word splitting; as a result, credentials containing whitespace or glob characters
# will be split incorrectly. The password is also passed on the wget command line,
# where it is visible in the process list.
WGET_PARAMS="-r -N -c -np --user $USERNAME --password $PASSWORD"

# Fetch a single URL with wget, using the options stored in WGET_PARAMS.
# Globals:   WGET_PARAMS (read) - option string, split into words on purpose
# Arguments: $1 - URL to download
# Outputs:   an error message on stderr when wget fails
# Exits:     terminates the shell with status 1 on a failed download
download() {
    local target_url="$1"
    # shellcheck disable=SC2086 -- WGET_PARAMS must be word-split into separate options
    if ! wget $WGET_PARAMS "$target_url"; then
        echo "Error: Failed to download $target_url" >&2
        exit 1
    fi
}

# Function to extract image paths from JSON files
# Arguments: $1 - path to a JSON file containing a list of objects that each
#                 carry an "image_path" key
# Outputs:   the image paths on stdout, one per line
# Returns:   non-zero if no Python interpreter is available or the file cannot
#            be read/parsed
get_image_paths() {
    local json_file=$1
    local py
    # Prefer `python`, but fall back to `python3` on systems without the alias.
    py=$(command -v python || command -v python3) || {
        echo "Error: no python interpreter found" >&2
        return 1
    }
    # The file name is handed over as an argument instead of being spliced into
    # the Python source, so quotes or backslashes in the path cannot break (or
    # inject into) the program. `with` closes the file once it has been parsed.
    "$py" -c '
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
print("\n".join(item["image_path"] for item in data))
' "$json_file"
}

# Gather image paths from JSON dataset files.
# Stop at the first split that cannot be read: carrying on with a partial list
# would silently skip images.
image_paths_train=$(get_image_paths 'mimiccxrvqa/dataset/train.json') \
    || { echo "Error: Failed to read image paths from mimiccxrvqa/dataset/train.json" >&2; exit 1; }
image_paths_valid=$(get_image_paths 'mimiccxrvqa/dataset/valid.json') \
    || { echo "Error: Failed to read image paths from mimiccxrvqa/dataset/valid.json" >&2; exit 1; }
image_paths_test=$(get_image_paths 'mimiccxrvqa/dataset/test.json') \
    || { echo "Error: Failed to read image paths from mimiccxrvqa/dataset/test.json" >&2; exit 1; }

# Combine paths from train, valid, and test (printf keeps backslashes literal, unlike echo -e)
image_paths=$(printf '%s\n' "$image_paths_train" "$image_paths_valid" "$image_paths_test")

# Remove duplicates and blank lines, then convert to an array.
# Blank entries must be dropped: an empty path would make the loop below request
# "$MIMIC_CXR_JPG_DIR/files/" itself, which wget -r would mirror recursively.
readarray -t arr < <(printf '%s\n' "$image_paths" | sed '/^[[:space:]]*$/d' | sort -u)

if (( ${#arr[@]} == 0 )); then
    echo "Error: No image paths found in the dataset JSON files" >&2
    exit 1
fi

# Display the total number of unique images
echo "Total number of unique images: ${#arr[@]}"

# Download the images (download exits the script on the first failure)
echo "Downloading images..."
for image_path in "${arr[@]}"; do
    echo "Downloading $image_path"
    download "$MIMIC_CXR_JPG_DIR/files/$image_path"
done
echo "All images have been successfully downloaded."

# Capture the end time and calculate runtime
end_time=$(date +%s)
runtime=$((end_time - start_time))

# Display the script runtime
echo "Script runtime: $runtime seconds"

ChestX-ray14

https://www.kaggle.com/datasets/nih-chest-xrays/data

1
2
3
#!/bin/bash
# Download the NIH ChestX-ray14 archive from Kaggle into ~/Downloads/data.zip.
#   -L             follow the redirect to the actual download location
#   --fail         exit non-zero on an HTTP error instead of saving the error
#                  page as data.zip
#   --create-dirs  create ~/Downloads if it does not exist yet
curl -L --fail --create-dirs -o ~/Downloads/data.zip \
  https://www.kaggle.com/api/v1/datasets/download/nih-chest-xrays/data

Data size: ~45 GB