From 9a64d1821768057bac123e8002380a6f3b12a683 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 28 Jun 2024 09:54:05 +0200 Subject: [PATCH 1/2] Default to Ray container image provided by OpenShift AI --- .../additional-demos/hf_interactive.ipynb | 1040 +++++------ .../additional-demos/local_interactive.ipynb | 2 +- .../additional-demos/ray_job_client.ipynb | 2 +- demo-notebooks/guided-demos/0_basic_ray.ipynb | 2 +- .../guided-demos/1_cluster_job_client.ipynb | 2 +- .../guided-demos/2_basic_interactive.ipynb | 2 +- .../notebook-ex-outputs/0_basic_ray.ipynb | 46 +- .../1_cluster_job_client.ipynb | 2 +- .../2_basic_interactive.ipynb | 1576 ++++++++--------- .../notebook-ex-outputs/interactivetest.yaml | 4 +- .../notebook-ex-outputs/jobtest.yaml | 4 +- .../notebook-ex-outputs/raytest.yaml | 4 +- .../preview_nbs/0_basic_ray.ipynb | 2 +- .../preview_nbs/1_cluster_job_client.ipynb | 2 +- .../preview_nbs/2_basic_interactive.ipynb | 2 +- docs/cluster-configuration.md | 2 +- poetry.lock | 136 +- pyproject.toml | 2 +- .../templates/base-template.yaml | 6 +- tests/e2e/support.py | 2 +- tests/test-case-bad.yaml | 6 +- tests/test-case-no-mcad.yamls | 6 +- tests/test-case.yaml | 6 +- tests/test-default-appwrapper.yaml | 6 +- tests/unit_test.py | 22 +- tests/unit_test_support.py | 2 +- 26 files changed, 1450 insertions(+), 1438 deletions(-) diff --git a/demo-notebooks/additional-demos/hf_interactive.ipynb b/demo-notebooks/additional-demos/hf_interactive.ipynb index 3db8c11b..ad552451 100644 --- a/demo-notebooks/additional-demos/hf_interactive.ipynb +++ b/demo-notebooks/additional-demos/hf_interactive.ipynb @@ -98,7 +98,7 @@ " max_cpus=8, \n", " min_memory=16, \n", " max_memory=16, \n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", " ))" @@ -153,13 +153,13 @@ ], "text/plain": [ "╭─────────────────────────╮\n", - "│ \u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare\u001b[0m\u001b[3m \u001b[0m │\n", - "│ \u001b[3m \u001b[0m\u001b[1;3mclusters in queue🚀\u001b[0m\u001b[3m \u001b[0m │\n", + "│ \u001B[3m \u001B[0m\u001B[1;3m 🚀 List of CodeFlare\u001B[0m\u001B[3m \u001B[0m │\n", + "│ \u001B[3m \u001B[0m\u001B[1;3mclusters in queue🚀\u001B[0m\u001B[3m \u001B[0m │\n", "│ +-----------+---------+ │\n", - "│ |\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| │\n", + "│ |\u001B[1m \u001B[0m\u001B[1mName \u001B[0m\u001B[1m \u001B[0m|\u001B[1m \u001B[0m\u001B[1mStatus \u001B[0m\u001B[1m \u001B[0m| │\n", "│ +===========+=========+ │\n", - "│ |\u001b[36m \u001b[0m\u001b[36mhfgputest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| │\n", - "│ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| │\n", + "│ |\u001B[36m \u001B[0m\u001B[36mhfgputest\u001B[0m\u001B[36m \u001B[0m|\u001B[35m \u001B[0m\u001B[35mpending\u001B[0m\u001B[35m \u001B[0m| │\n", + "│ |\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m|\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m| │\n", "│ +-----------+---------+ │\n", "╰─────────────────────────╯\n" ] @@ -240,22 +240,22 @@ "\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare clusters 🚀\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + "\u001B[3m \u001B[0m\u001B[1;3m 🚀 List of CodeFlare clusters 🚀\u001B[0m\u001B[3m \u001B[0m\n", + "\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\n", " ╭────────────────────────────────────────────────────────────────╮ \n", - " │ \u001b[1;37;42mOwner\u001b[0m │ \n", - " │ \u001b[1;4mhfgputest\u001b[0m Active ✅ │ \n", + " │ \u001B[1;37;42mOwner\u001B[0m │ \n", + " │ \u001B[1;4mhfgputest\u001B[0m Active ✅ │ \n", " │ │ \n", - " │ \u001b[1mURI:\u001b[0m ray://hfgputest-head-svc.default.svc:10001 │ \n", + " │ \u001B[1mURI:\u001B[0m ray://hfgputest-head-svc.default.svc:10001 │ \n", " │ │ \n", - " │ \u001b]8;id=552692;ray-dashboard-hfgputest-default.apps.prepfullinstall.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ \u001B]8;id=552692;ray-dashboard-hfgputest-default.apps.prepfullinstall.psap.aws.rhperfscale.org\u001B\\\u001B[4;34mDashboard🔗\u001B[0m\u001B]8;;\u001B\\ │ \n", " │ │ \n", - " │ \u001b[3m Cluster Resources \u001b[0m │ \n", + " │ \u001B[3m Cluster Resources \u001B[0m │ \n", " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", - " │ │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", - " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[36m \u001b[0m\u001b[36m1 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m16G~16G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m8 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m4 \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001B[1m \u001B[0m\u001B[1mMin\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mMax\u001B[0m\u001B[1m \u001B[0m │ │ \u001B[1m \u001B[0m\u001B[1mMemory \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mCPU \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mGPU \u001B[0m\u001B[1m \u001B[0m │ │ \n", + " │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \n", + " │ │ \u001B[36m \u001B[0m\u001B[36m1 \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m1 \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m16G~16G \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m8 \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m4 \u001B[0m\u001B[35m \u001B[0m │ │ \n", + " │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \n", " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n", " ╰────────────────────────────────────────────────────────────────╯ \n" ] @@ -469,7 +469,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...\n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...\n" ] }, { @@ -645,7 +645,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Dataset imdb downloaded and prepared to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.\n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Dataset imdb downloaded and prepared to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.\n" ] }, { @@ -767,13 +767,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m len of train Dataset({\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m num_rows: 100\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m }) and test Dataset({\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m num_rows: 100\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m })\n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m len of train Dataset({\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m num_rows: 100\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m }) and test Dataset({\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m num_rows: 100\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m })\n" ] }, { @@ -787,54 +787,54 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m To disable this warning, you can either:\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:55:58 (running for 00:00:05.07)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 6.4/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m To disable this warning, you can either:\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \t- Avoid using `tokenizers` before the fork if possible\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:55:58 (running for 00:00:05.07)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 6.4/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m 2022-11-04 07:56:02,047\tINFO torch.py:346 -- Setting up process group for: env:// [rank=0, world_size=4]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m 2022-11-04 07:56:02,045\tINFO torch.py:346 -- Setting up process group for: env:// [rank=2, world_size=4]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m 2022-11-04 07:56:02,047\tINFO torch.py:346 -- Setting up process group for: env:// [rank=1, world_size=4]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m 2022-11-04 07:56:02,048\tINFO torch.py:346 -- Setting up process group for: env:// [rank=3, world_size=4]\n" + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m 2022-11-04 07:56:02,047\tINFO torch.py:346 -- Setting up process group for: env:// [rank=0, world_size=4]\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m 2022-11-04 07:56:02,045\tINFO torch.py:346 -- Setting up process group for: env:// [rank=2, world_size=4]\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m 2022-11-04 07:56:02,047\tINFO torch.py:346 -- Setting up process group for: env:// [rank=1, world_size=4]\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m 2022-11-04 07:56:02,048\tINFO torch.py:346 -- Setting up process group for: env:// [rank=3, world_size=4]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:03 (running for 00:00:10.07)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 7.2/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:03 (running for 00:00:10.07)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 7.2/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { @@ -881,20 +881,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:08 (running for 00:00:15.07)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 7.5/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:08 (running for 00:00:15.07)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 7.5/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { @@ -911,510 +911,510 @@ "Downloading: 95%|█████████▌| 255M/268M [00:04<00:00, 65.7MB/s]\n", "Downloading: 98%|█████████▊| 262M/268M [00:04<00:00, 65.8MB/s]\n", "Downloading: 100%|██████████| 268M/268M [00:04<00:00, 63.9MB/s]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m warnings.warn(\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m warnings.warn(\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m warnings.warn(\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m warnings.warn(\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m ***** Running training *****\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Num examples = 6250\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Num Epochs = 1\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Instantaneous batch size per device = 16\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Total train batch size (w. parallel, distributed & accumulation) = 64\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Gradient Accumulation steps = 1\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Total optimization steps = 391\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Number of trainable parameters = 66955010\n" + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m warnings.warn(\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m warnings.warn(\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m warnings.warn(\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m /tmp/ray/session_2022-11-04_07-51-23_507232_7/runtime_resources/pip/4a36d7bd0bbff8fccea52f9c0d942dd63707933f/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m warnings.warn(\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m ***** Running training *****\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Num examples = 6250\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Num Epochs = 1\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Instantaneous batch size per device = 16\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Total train batch size (w. parallel, distributed & accumulation) = 64\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Gradient Accumulation steps = 1\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Total optimization steps = 391\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Number of trainable parameters = 66955010\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:13 (running for 00:00:20.08)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 12.3/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:13 (running for 00:00:20.08)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 12.3/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001b[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001b[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001b[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n" + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=184, ip=10.129.66.16)\u001B[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=183, ip=10.129.66.16)\u001B[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=185, ip=10.129.66.16)\u001B[0m [W reducer.cpp:1251] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:18 (running for 00:00:25.08)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:23 (running for 00:00:30.08)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:28 (running for 00:00:35.09)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:33 (running for 00:00:40.09)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:38 (running for 00:00:45.10)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:43 (running for 00:00:50.10)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:48 (running for 00:00:55.10)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:53 (running for 00:01:00.10)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:56:59 (running for 00:01:05.11)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:04 (running for 00:01:10.11)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:09 (running for 00:01:15.11)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:14 (running for 00:01:20.12)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:19 (running for 00:01:25.12)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:24 (running for 00:01:30.12)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:29 (running for 00:01:35.13)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:34 (running for 00:01:40.13)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:39 (running for 00:01:45.13)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:44 (running for 00:01:50.13)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:49 (running for 00:01:55.14)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:54 (running for 00:02:00.14)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:57:59 (running for 00:02:05.15)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 13.7/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:18 (running for 00:00:25.08)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:23 (running for 00:00:30.08)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:28 (running for 00:00:35.09)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:33 (running for 00:00:40.09)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:38 (running for 00:00:45.10)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:43 (running for 00:00:50.10)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:48 (running for 00:00:55.10)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:53 (running for 00:01:00.10)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:56:59 (running for 00:01:05.11)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:04 (running for 00:01:10.11)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:09 (running for 00:01:15.11)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:14 (running for 00:01:20.12)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:19 (running for 00:01:25.12)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:24 (running for 00:01:30.12)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:29 (running for 00:01:35.13)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:34 (running for 00:01:40.13)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:39 (running for 00:01:45.13)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:44 (running for 00:01:50.13)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:49 (running for 00:01:55.14)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:54 (running for 00:02:00.14)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:57:59 (running for 00:02:05.15)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 13.7/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Saving model checkpoint to /tmp/hf_imdb/test/checkpoint-391\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Configuration saved in /tmp/hf_imdb/test/checkpoint-391/config.json\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Model weights saved in /tmp/hf_imdb/test/checkpoint-391/pytorch_model.bin\n" + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Saving model checkpoint to /tmp/hf_imdb/test/checkpoint-391\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Configuration saved in /tmp/hf_imdb/test/checkpoint-391/config.json\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Model weights saved in /tmp/hf_imdb/test/checkpoint-391/pytorch_model.bin\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result for HuggingFaceTrainer_c7d60_00000:\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m _time_this_iter_s: 118.07144260406494\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m _timestamp: 1667573883\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m _training_iteration: 1\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m date: 2022-11-04_07-58-03\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m done: false\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m epoch: 1.0\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m experiment_id: 7bc6ab25d0414fcbb589bcb5d0f29b99\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m hostname: hfgputest-worker-small-group-hfgputest-q4758\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m iterations_since_restore: 1\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m node_ip: 10.129.66.16\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m pid: 146\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m should_checkpoint: true\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m step: 391\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m time_since_restore: 124.55581378936768\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m time_this_iter_s: 124.55581378936768\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m time_total_s: 124.55581378936768\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m timestamp: 1667573883\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m timesteps_since_restore: 0\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_loss: 0.2760564701636429\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_runtime: 109.7668\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_samples_per_second: 56.939\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_steps_per_second: 3.562\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m training_iteration: 1\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m trial_id: c7d60_00000\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m warmup_time: 0.003995656967163086\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m {'train_runtime': 109.7668, 'train_samples_per_second': 56.939, 'train_steps_per_second': 3.562, 'train_loss': 0.2760564701636429, 'epoch': 1.0}\n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result for HuggingFaceTrainer_c7d60_00000:\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m _time_this_iter_s: 118.07144260406494\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m _timestamp: 1667573883\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m _training_iteration: 1\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m date: 2022-11-04_07-58-03\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m done: false\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m epoch: 1.0\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m experiment_id: 7bc6ab25d0414fcbb589bcb5d0f29b99\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m hostname: hfgputest-worker-small-group-hfgputest-q4758\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m iterations_since_restore: 1\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m node_ip: 10.129.66.16\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m pid: 146\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m should_checkpoint: true\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m step: 391\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m time_since_restore: 124.55581378936768\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m time_this_iter_s: 124.55581378936768\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m time_total_s: 124.55581378936768\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m timestamp: 1667573883\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m timesteps_since_restore: 0\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_loss: 0.2760564701636429\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_runtime: 109.7668\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_samples_per_second: 56.939\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_steps_per_second: 3.562\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m training_iteration: 1\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m trial_id: c7d60_00000\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m warmup_time: 0.003995656967163086\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m {'train_runtime': 109.7668, 'train_samples_per_second': 56.939, 'train_steps_per_second': 3.562, 'train_loss': 0.2760564701636429, 'epoch': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m Training completed. Do not forget to share your model on huggingface.co/models =)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001b[0m \n" + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m \n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m \n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m \n", + "\u001B[2m\u001B[36m(BaseWorkerMixin pid=182, ip=10.129.66.16)\u001B[0m \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:58:13 (running for 00:02:19.36)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 16.0/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc | iter | total time (s) | train_runtime | train_samples_per_second | train_steps_per_second |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+----------+------------------+--------+------------------+-----------------+----------------------------+--------------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 | 1 | 124.556 | 109.767 | 56.939 | 3.562 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+----------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:58:13 (running for 00:02:19.36)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 16.0/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 5.0/10 CPUs, 4.0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc | iter | total time (s) | train_runtime | train_samples_per_second | train_steps_per_second |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+----------+------------------+--------+------------------+-----------------+----------------------------+--------------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | RUNNING | 10.129.66.16:146 | 1 | 124.556 | 109.767 | 56.939 | 3.562 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+----------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m 2022-11-04 07:58:13,248\tWARNING util.py:214 -- The `process_trial_save` operation took 9.709 s, which may be a performance bottleneck.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m 2022-11-04 07:58:13,248\tWARNING trial_runner.py:856 -- Consider turning off forced head-worker trial checkpoint syncs by setting sync_on_checkpoint=False. Note that this may result in faulty trial restoration if a failure occurs while the checkpoint is being synced from the worker to the head node.\n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m 2022-11-04 07:58:13,248\tWARNING util.py:214 -- The `process_trial_save` operation took 9.709 s, which may be a performance bottleneck.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m 2022-11-04 07:58:13,248\tWARNING trial_runner.py:856 -- Consider turning off forced head-worker trial checkpoint syncs by setting sync_on_checkpoint=False. Note that this may result in faulty trial restoration if a failure occurs while the checkpoint is being synced from the worker to the head node.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result for HuggingFaceTrainer_c7d60_00000:\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m _time_this_iter_s: 118.07144260406494\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m _timestamp: 1667573883\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m _training_iteration: 1\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m date: 2022-11-04_07-58-03\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m done: true\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m epoch: 1.0\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m experiment_id: 7bc6ab25d0414fcbb589bcb5d0f29b99\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m experiment_tag: '0'\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m hostname: hfgputest-worker-small-group-hfgputest-q4758\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m iterations_since_restore: 1\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m node_ip: 10.129.66.16\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m pid: 146\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m should_checkpoint: true\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m step: 391\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m time_since_restore: 124.55581378936768\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m time_this_iter_s: 124.55581378936768\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m time_total_s: 124.55581378936768\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m timestamp: 1667573883\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m timesteps_since_restore: 0\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_loss: 0.2760564701636429\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_runtime: 109.7668\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_samples_per_second: 56.939\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m train_steps_per_second: 3.562\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m training_iteration: 1\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m trial_id: c7d60_00000\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m warmup_time: 0.003995656967163086\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Current time: 2022-11-04 07:58:16 (running for 00:02:22.40)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Memory usage on this node: 9.1/240.1 GiB\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Resources requested: 0/10 CPUs, 0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m Number of trials: 1/1 (1 TERMINATED)\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+------------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | Trial name | status | loc | iter | total time (s) | train_runtime | train_samples_per_second | train_steps_per_second |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m |--------------------------------+------------+------------------+--------+------------------+-----------------+----------------------------+--------------------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m | HuggingFaceTrainer_c7d60_00000 | TERMINATED | 10.129.66.16:146 | 1 | 124.556 | 109.767 | 56.939 | 3.562 |\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m +--------------------------------+------------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result for HuggingFaceTrainer_c7d60_00000:\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m _time_this_iter_s: 118.07144260406494\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m _timestamp: 1667573883\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m _training_iteration: 1\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m date: 2022-11-04_07-58-03\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m done: true\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m epoch: 1.0\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m experiment_id: 7bc6ab25d0414fcbb589bcb5d0f29b99\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m experiment_tag: '0'\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m hostname: hfgputest-worker-small-group-hfgputest-q4758\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m iterations_since_restore: 1\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m node_ip: 10.129.66.16\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m pid: 146\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m should_checkpoint: true\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m step: 391\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m time_since_restore: 124.55581378936768\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m time_this_iter_s: 124.55581378936768\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m time_total_s: 124.55581378936768\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m timestamp: 1667573883\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m timesteps_since_restore: 0\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_loss: 0.2760564701636429\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_runtime: 109.7668\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_samples_per_second: 56.939\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m train_steps_per_second: 3.562\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m training_iteration: 1\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m trial_id: c7d60_00000\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m warmup_time: 0.003995656967163086\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Current time: 2022-11-04 07:58:16 (running for 00:02:22.40)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Memory usage on this node: 9.1/240.1 GiB\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Resources requested: 0/10 CPUs, 0/4 GPUs, 0.0/22.35 GiB heap, 0.0/6.59 GiB objects\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-11-04_07-55-53\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m Number of trials: 1/1 (1 TERMINATED)\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+------------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | Trial name | status | loc | iter | total time (s) | train_runtime | train_samples_per_second | train_steps_per_second |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m |--------------------------------+------------+------------------+--------+------------------+-----------------+----------------------------+--------------------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m | HuggingFaceTrainer_c7d60_00000 | TERMINATED | 10.129.66.16:146 | 1 | 124.556 | 109.767 | 56.939 | 3.562 |\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m +--------------------------------+------------+------------------+--------+------------------+-----------------+----------------------------+--------------------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m 2022-11-04 07:58:16,286\tWARNING util.py:214 -- The `process_trial_save` operation took 2.161 s, which may be a performance bottleneck.\n", - "\u001b[2m\u001b[36m(train_fn pid=250)\u001b[0m 2022-11-04 07:58:16,398\tINFO tune.py:747 -- Total run time: 142.70 seconds (142.40 seconds for the tuning loop).\n" + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m 2022-11-04 07:58:16,286\tWARNING util.py:214 -- The `process_trial_save` operation took 2.161 s, which may be a performance bottleneck.\n", + "\u001B[2m\u001B[36m(train_fn pid=250)\u001B[0m 2022-11-04 07:58:16,398\tINFO tune.py:747 -- Total run time: 142.70 seconds (142.40 seconds for the tuning loop).\n" ] } ], diff --git a/demo-notebooks/additional-demos/local_interactive.ipynb b/demo-notebooks/additional-demos/local_interactive.ipynb index 74ac4055..1d3c83ad 100644 --- a/demo-notebooks/additional-demos/local_interactive.ipynb +++ b/demo-notebooks/additional-demos/local_interactive.ipynb @@ -62,7 +62,7 @@ " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", " ))" diff --git a/demo-notebooks/additional-demos/ray_job_client.ipynb b/demo-notebooks/additional-demos/ray_job_client.ipynb index c58a4d73..c452fb31 100644 --- a/demo-notebooks/additional-demos/ray_job_client.ipynb +++ b/demo-notebooks/additional-demos/ray_job_client.ipynb @@ -53,7 +53,7 @@ " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", "))" ] diff --git a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb index fd0eea26..0cd3419f 100644 --- a/demo-notebooks/guided-demos/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/0_basic_ray.ipynb @@ -70,7 +70,7 @@ " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" diff --git a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb index f32f6046..de8fafdd 100644 --- a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb +++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb @@ -52,7 +52,7 @@ " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" diff --git a/demo-notebooks/guided-demos/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb index 7b89e8f0..b6a13b8c 100644 --- a/demo-notebooks/guided-demos/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb @@ -69,7 +69,7 @@ " max_cpus=2,\n", " min_memory=8,\n", " max_memory=8,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb index ed21f46d..646e2424 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb @@ -78,7 +78,7 @@ " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" @@ -134,13 +134,13 @@ ], "text/plain": [ "╭───────────────────────╮\n", - "│ \u001b[3m \u001b[0m\u001b[1;3m 🚀 Cluster Queue\u001b[0m\u001b[3m \u001b[0m │\n", - "│ \u001b[3m \u001b[0m\u001b[1;3mStatus 🚀\u001b[0m\u001b[3m \u001b[0m │\n", + "│ \u001B[3m \u001B[0m\u001B[1;3m 🚀 Cluster Queue\u001B[0m\u001B[3m \u001B[0m │\n", + "│ \u001B[3m \u001B[0m\u001B[1;3mStatus 🚀\u001B[0m\u001B[3m \u001B[0m │\n", "│ +---------+---------+ │\n", - "│ |\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| │\n", + "│ |\u001B[1m \u001B[0m\u001B[1mName \u001B[0m\u001B[1m \u001B[0m|\u001B[1m \u001B[0m\u001B[1mStatus \u001B[0m\u001B[1m \u001B[0m| │\n", "│ +=========+=========+ │\n", - "│ |\u001b[36m \u001b[0m\u001b[36mraytest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| │\n", - "│ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| │\n", + "│ |\u001B[36m \u001B[0m\u001B[36mraytest\u001B[0m\u001B[36m \u001B[0m|\u001B[35m \u001B[0m\u001B[35mpending\u001B[0m\u001B[35m \u001B[0m| │\n", + "│ |\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m|\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m| │\n", "│ +---------+---------+ │\n", "╰───────────────────────╯\n" ] @@ -205,15 +205,15 @@ "\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Status 🚀\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + "\u001B[3m \u001B[0m\u001B[1;3m 🚀 CodeFlare Cluster Status 🚀\u001B[0m\u001B[3m \u001B[0m\n", + "\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\n", " ╭──────────────────────────────────────────────────────────────╮ \n", - " │ \u001b[1;37;42mName\u001b[0m │ \n", - " │ \u001b[1;4mraytest\u001b[0m Active ✅ │ \n", + " │ \u001B[1;37;42mName\u001B[0m │ \n", + " │ \u001B[1;4mraytest\u001B[0m Active ✅ │ \n", " │ │ \n", - " │ \u001b[1mURI:\u001b[0m ray://raytest-head-svc.default.svc:10001 │ \n", + " │ \u001B[1mURI:\u001B[0m ray://raytest-head-svc.default.svc:10001 │ \n", " │ │ \n", - " │ \u001b]8;id=630217;ray-dashboard-raytest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ \u001B]8;id=630217;ray-dashboard-raytest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001B\\\u001B[4;34mDashboard🔗\u001B[0m\u001B]8;;\u001B\\ │ \n", " │ │ \n", " ╰──────────────────────────────────────────────────────────────╯ \n" ] @@ -274,22 +274,22 @@ "\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Details 🚀\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + "\u001B[3m \u001B[0m\u001B[1;3m 🚀 CodeFlare Cluster Details 🚀\u001B[0m\u001B[3m \u001B[0m\n", + "\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\n", " ╭───────────────────────────────────────────────────────────────╮ \n", - " │ \u001b[1;37;42mName\u001b[0m │ \n", - " │ \u001b[1;4mraytest\u001b[0m Active ✅ │ \n", + " │ \u001B[1;37;42mName\u001B[0m │ \n", + " │ \u001B[1;4mraytest\u001B[0m Active ✅ │ \n", " │ │ \n", - " │ \u001b[1mURI:\u001b[0m ray://raytest-head-svc.default.svc:10001 │ \n", + " │ \u001B[1mURI:\u001B[0m ray://raytest-head-svc.default.svc:10001 │ \n", " │ │ \n", - " │ \u001b]8;id=623965;http://ray-dashboard-raytest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ \u001B]8;id=623965;http://ray-dashboard-raytest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001B\\\u001B[4;34mDashboard🔗\u001B[0m\u001B]8;;\u001B\\ │ \n", " │ │ \n", - " │ \u001b[3m Cluster Resources \u001b[0m │ \n", + " │ \u001B[3m Cluster Resources \u001B[0m │ \n", " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", - " │ │ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", - " │ │ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m4~4 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001B[1m \u001B[0m\u001B[1m# Workers\u001B[0m\u001B[1m \u001B[0m │ │ \u001B[1m \u001B[0m\u001B[1mMemory \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mCPU \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mGPU \u001B[0m\u001B[1m \u001B[0m │ │ \n", + " │ │ \u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \n", + " │ │ \u001B[35m \u001B[0m\u001B[35m2 \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m4~4 \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m1 \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m0 \u001B[0m\u001B[35m \u001B[0m │ │ \n", + " │ │ \u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \n", " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n", " ╰───────────────────────────────────────────────────────────────╯ \n" ] diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb index 7deb6633..b2e954e2 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb @@ -52,7 +52,7 @@ " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb index 5e499b6b..443ea063 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb @@ -77,7 +77,7 @@ " max_cpus=2,\n", " min_memory=8,\n", " max_memory=8,\n", - " image=\"quay.io/project-codeflare/ray:2.20.0-py39-cu118\",\n", + " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" @@ -134,22 +134,22 @@ "\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Details 🚀\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + "\u001B[3m \u001B[0m\u001B[1;3m 🚀 CodeFlare Cluster Details 🚀\u001B[0m\u001B[3m \u001B[0m\n", + "\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\n", " ╭──────────────────────────────────────────────────────────────────────╮ \n", - " │ \u001b[1;37;42mName\u001b[0m │ \n", - " │ \u001b[1;4minteractivetest\u001b[0m Active ✅ │ \n", + " │ \u001B[1;37;42mName\u001B[0m │ \n", + " │ \u001B[1;4minteractivetest\u001B[0m Active ✅ │ \n", " │ │ \n", - " │ \u001b[1mURI:\u001b[0m ray://interactivetest-head-svc.default.svc:10001 │ \n", + " │ \u001B[1mURI:\u001B[0m ray://interactivetest-head-svc.default.svc:10001 │ \n", " │ │ \n", - " │ \u001b]8;id=970589;http://ray-dashboard-interactivetest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ \u001B]8;id=970589;http://ray-dashboard-interactivetest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001B\\\u001B[4;34mDashboard🔗\u001B[0m\u001B]8;;\u001B\\ │ \n", " │ │ \n", - " │ \u001b[3m Cluster Resources \u001b[0m │ \n", + " │ \u001B[3m Cluster Resources \u001B[0m │ \n", " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", - " │ │ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", - " │ │ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m8~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", + " │ │ \u001B[1m \u001B[0m\u001B[1m# Workers\u001B[0m\u001B[1m \u001B[0m │ │ \u001B[1m \u001B[0m\u001B[1mMemory \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mCPU \u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m\u001B[1mGPU \u001B[0m\u001B[1m \u001B[0m │ │ \n", + " │ │ \u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \n", + " │ │ \u001B[35m \u001B[0m\u001B[35m2 \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m8~8 \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m2 \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m1 \u001B[0m\u001B[35m \u001B[0m │ │ \n", + " │ │ \u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[36m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m\u001B[35m \u001B[0m │ │ \n", " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n", " ╰──────────────────────────────────────────────────────────────────────╯ \n" ] @@ -381,7 +381,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...\n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...\n" ] }, { @@ -494,7 +494,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Dataset imdb downloaded and prepared to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.\n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Dataset imdb downloaded and prepared to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.\n" ] }, { @@ -613,13 +613,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m len of train Dataset({\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m num_rows: 100\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m }) and test Dataset({\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m num_rows: 100\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m })\n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m len of train Dataset({\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m num_rows: 100\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m }) and test Dataset({\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m features: ['text', 'label', 'input_ids', 'attention_mask'],\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m num_rows: 100\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m })\n" ] }, { @@ -627,106 +627,106 @@ "output_type": "stream", "text": [ " \n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m 2023-08-09 14:51:50,865\tWARNING dataset.py:253 -- \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m /tmp/ipykernel_265/307576807.py:57: DeprecationWarning: `HuggingFaceTrainer`, `HuggingFacePredictor` and `HuggingFaceCheckpoint` have been renamed to `TransformersTrainer`, `TransformersPredictor` and `TransformersCheckpoint` respectively. Update your code to use the new import paths. This will raise an exception in the future.\n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m 2023-08-09 14:51:50,865\tWARNING dataset.py:253 -- \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m /tmp/ipykernel_265/307576807.py:57: DeprecationWarning: `HuggingFaceTrainer`, `HuggingFacePredictor` and `HuggingFaceCheckpoint` have been renamed to `TransformersTrainer`, `TransformersPredictor` and `TransformersCheckpoint` respectively. Update your code to use the new import paths. This will raise an exception in the future.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m To disable this warning, you can either:\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Current time: 2023-08-09 14:51:51 (running for 00:00:00.12)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Logical resource usage: 0/6 CPUs, 0/2 GPUs\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Number of trials: 1/1 (1 PENDING)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m +--------------------------------+----------+-------+\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m |--------------------------------+----------+-------|\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m | HuggingFaceTrainer_f2621_00000 | PENDING | |\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m +--------------------------------+----------+-------+\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m To disable this warning, you can either:\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \t- Avoid using `tokenizers` before the fork if possible\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Current time: 2023-08-09 14:51:51 (running for 00:00:00.12)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Logical resource usage: 0/6 CPUs, 0/2 GPUs\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Number of trials: 1/1 (1 PENDING)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m +--------------------------------+----------+-------+\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m |--------------------------------+----------+-------|\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m | HuggingFaceTrainer_f2621_00000 | PENDING | |\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m +--------------------------------+----------+-------+\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:55,978\tWARNING dataset.py:253 -- \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m \n", - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:55,978\tWARNING dataset.py:253 -- \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m \n", + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Current time: 2023-08-09 14:51:56 (running for 00:00:05.16)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m +--------------------------------+----------+-----------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m |--------------------------------+----------+-----------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m | HuggingFaceTrainer_f2621_00000 | RUNNING | 10.130.4.19:196 |\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m +--------------------------------+----------+-----------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Current time: 2023-08-09 14:51:56 (running for 00:00:05.16)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m +--------------------------------+----------+-----------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m |--------------------------------+----------+-----------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m | HuggingFaceTrainer_f2621_00000 | RUNNING | 10.130.4.19:196 |\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m +--------------------------------+----------+-----------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:57,260\tINFO backend_executor.py:137 -- Starting distributed worker processes: ['235 (10.130.4.19)', '232 (10.129.4.19)']\n", - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:58,957\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:58,957\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:58,958\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:58,969\tINFO streaming_executor.py:149 -- Shutting down .\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001b[0m 2023-08-09 14:51:58,912\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]\n" + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:57,260\tINFO backend_executor.py:137 -- Starting distributed worker processes: ['235 (10.130.4.19)', '232 (10.129.4.19)']\n", + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:58,957\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:58,957\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:58,958\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\u001B[2m\u001B[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:58,969\tINFO streaming_executor.py:149 -- Shutting down .\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001B[0m 2023-08-09 14:51:58,912\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m == Status ==\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Current time: 2023-08-09 14:52:01 (running for 00:00:10.18)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Using FIFO scheduling algorithm.\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m +--------------------------------+----------+-----------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m | Trial name | status | loc |\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m |--------------------------------+----------+-----------------|\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m | HuggingFaceTrainer_f2621_00000 | RUNNING | 10.130.4.19:196 |\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m +--------------------------------+----------+-----------------+\n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=425)\u001b[0m \n" + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m == Status ==\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Current time: 2023-08-09 14:52:01 (running for 00:00:10.18)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Using FIFO scheduling algorithm.\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m Number of trials: 1/1 (1 RUNNING)\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m +--------------------------------+----------+-----------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m | Trial name | status | loc |\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m |--------------------------------+----------+-----------------|\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m | HuggingFaceTrainer_f2621_00000 | RUNNING | 10.130.4.19:196 |\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m +--------------------------------+----------+-----------------+\n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_fn pid=425)\u001B[0m \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001b[0m 2023-08-09 14:52:01,262\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001b[0m 2023-08-09 14:52:01,262\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001b[0m 2023-08-09 14:52:01,262\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001b[0m 2023-08-09 14:52:01,274\tINFO streaming_executor.py:149 -- Shutting down .\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001b[0m 2023-08-09 14:52:01,252\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001b[0m 2023-08-09 14:52:01,252\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001b[0m 2023-08-09 14:52:01,252\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001b[0m 2023-08-09 14:52:01,263\tINFO streaming_executor.py:149 -- Shutting down .\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001B[0m 2023-08-09 14:52:01,262\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001B[0m 2023-08-09 14:52:01,262\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001B[0m 2023-08-09 14:52:01,262\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=235, ip=10.130.4.19)\u001B[0m 2023-08-09 14:52:01,274\tINFO streaming_executor.py:149 -- Shutting down .\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001B[0m 2023-08-09 14:52:01,252\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001B[0m 2023-08-09 14:52:01,252\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001B[0m 2023-08-09 14:52:01,252\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=232, ip=10.129.4.19)\u001B[0m 2023-08-09 14:52:01,263\tINFO streaming_executor.py:149 -- Shutting down .\n", "Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 151kB/s]\n", "Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 146kB/s]\n", "Downloading model.safetensors: 0%| | 0.00/268M [00:00=3.8" files = [ - {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"}, - {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"}, + {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, + {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, ] [package.extras] @@ -734,13 +734,13 @@ files = [ [[package]] name = "fsspec" -version = "2024.6.0" +version = "2024.6.1" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2024.6.0-py3-none-any.whl", hash = "sha256:58d7122eb8a1a46f7f13453187bfea4972d66bf01618d37366521b1998034cee"}, - {file = "fsspec-2024.6.0.tar.gz", hash = "sha256:f579960a56e6d8038a9efc8f9c77279ec12e6299aa86b0769a7e9c46b94527c2"}, + {file = "fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e"}, + {file = "fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49"}, ] [package.extras] @@ -773,25 +773,26 @@ tqdm = ["tqdm"] [[package]] name = "google-api-core" -version = "2.8.0" +version = "2.19.1" description = "Google API client core library" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "google-api-core-2.8.0.tar.gz", hash = "sha256:065bb8e11c605fd232707ae50963dc1c8af5b3c95b4568887515985e6c1156b3"}, - {file = "google_api_core-2.8.0-py3-none-any.whl", hash = "sha256:1b9f59236ce1bae9a687c1d4f22957e79a2669e53d032893f6bf0fca54f6931d"}, + {file = "google-api-core-2.19.1.tar.gz", hash = "sha256:f4695f1e3650b316a795108a76a1c416e6afb036199d1c1f1f110916df479ffd"}, + {file = "google_api_core-2.19.1-py3-none-any.whl", hash = "sha256:f12a9b8309b5e21d92483bbd47ce2c445861ec7d269ef6784ecc0ea8c1fa6125"}, ] [package.dependencies] -google-auth = ">=1.25.0,<3.0dev" -googleapis-common-protos = ">=1.52.0,<2.0dev" -protobuf = ">=3.12.0" -requests = ">=2.18.0,<3.0.0dev" +google-auth = ">=2.14.1,<3.0.dev0" +googleapis-common-protos = ">=1.56.2,<2.0.dev0" +proto-plus = ">=1.22.3,<2.0.0dev" +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" +requests = ">=2.18.0,<3.0.0.dev0" [package.extras] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)"] -grpcgcp = ["grpcio-gcp (>=0.2.2)"] -grpcio-gcp = ["grpcio-gcp (>=0.2.2)"] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"] +grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] +grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-auth" @@ -818,17 +819,17 @@ requests = ["requests (>=2.20.0,<3.0.0.dev0)"] [[package]] name = "googleapis-common-protos" -version = "1.63.1" +version = "1.63.2" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"}, - {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = "sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"}, + {file = "googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"}, + {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"}, ] [package.dependencies] -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] @@ -904,13 +905,13 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.2.0" +version = "8.0.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.2.0-py3-none-any.whl", hash = "sha256:04e4aad329b8b948a5711d394fa8759cb80f009225441b4f2a02bd4d8e5f426c"}, - {file = "importlib_metadata-7.2.0.tar.gz", hash = "sha256:3ff4519071ed42740522d494d04819b666541b9752c43012f85afb2cc220fcc6"}, + {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"}, + {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"}, ] [package.dependencies] @@ -1758,24 +1759,41 @@ files = [ [package.dependencies] wcwidth = "*" +[[package]] +name = "proto-plus" +version = "1.24.0" +description = "Beautiful, Pythonic protocol buffers." +optional = false +python-versions = ">=3.7" +files = [ + {file = "proto-plus-1.24.0.tar.gz", hash = "sha256:30b72a5ecafe4406b0d339db35b56c4059064e69227b8c3bda7462397f966445"}, + {file = "proto_plus-1.24.0-py3-none-any.whl", hash = "sha256:402576830425e5f6ce4c2a6702400ac79897dab0b4343821aa5188b0fab81a12"}, +] + +[package.dependencies] +protobuf = ">=3.19.0,<6.0.0dev" + +[package.extras] +testing = ["google-api-core (>=1.31.5)"] + [[package]] name = "protobuf" -version = "5.27.1" +version = "5.27.2" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "protobuf-5.27.1-cp310-abi3-win32.whl", hash = "sha256:3adc15ec0ff35c5b2d0992f9345b04a540c1e73bfee3ff1643db43cc1d734333"}, - {file = "protobuf-5.27.1-cp310-abi3-win_amd64.whl", hash = "sha256:25236b69ab4ce1bec413fd4b68a15ef8141794427e0b4dc173e9d5d9dffc3bcd"}, - {file = "protobuf-5.27.1-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4e38fc29d7df32e01a41cf118b5a968b1efd46b9c41ff515234e794011c78b17"}, - {file = "protobuf-5.27.1-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:917ed03c3eb8a2d51c3496359f5b53b4e4b7e40edfbdd3d3f34336e0eef6825a"}, - {file = "protobuf-5.27.1-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:ee52874a9e69a30271649be88ecbe69d374232e8fd0b4e4b0aaaa87f429f1631"}, - {file = "protobuf-5.27.1-cp38-cp38-win32.whl", hash = "sha256:7a97b9c5aed86b9ca289eb5148df6c208ab5bb6906930590961e08f097258107"}, - {file = "protobuf-5.27.1-cp38-cp38-win_amd64.whl", hash = "sha256:f6abd0f69968792da7460d3c2cfa7d94fd74e1c21df321eb6345b963f9ec3d8d"}, - {file = "protobuf-5.27.1-cp39-cp39-win32.whl", hash = "sha256:dfddb7537f789002cc4eb00752c92e67885badcc7005566f2c5de9d969d3282d"}, - {file = "protobuf-5.27.1-cp39-cp39-win_amd64.whl", hash = "sha256:39309898b912ca6febb0084ea912e976482834f401be35840a008da12d189340"}, - {file = "protobuf-5.27.1-py3-none-any.whl", hash = "sha256:4ac7249a1530a2ed50e24201d6630125ced04b30619262f06224616e0030b6cf"}, - {file = "protobuf-5.27.1.tar.gz", hash = "sha256:df5e5b8e39b7d1c25b186ffdf9f44f40f810bbcc9d2b71d9d3156fee5a9adf15"}, + {file = "protobuf-5.27.2-cp310-abi3-win32.whl", hash = "sha256:354d84fac2b0d76062e9b3221f4abbbacdfd2a4d8af36bab0474f3a0bb30ab38"}, + {file = "protobuf-5.27.2-cp310-abi3-win_amd64.whl", hash = "sha256:0e341109c609749d501986b835f667c6e1e24531096cff9d34ae411595e26505"}, + {file = "protobuf-5.27.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a109916aaac42bff84702fb5187f3edadbc7c97fc2c99c5ff81dd15dcce0d1e5"}, + {file = "protobuf-5.27.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:176c12b1f1c880bf7a76d9f7c75822b6a2bc3db2d28baa4d300e8ce4cde7409b"}, + {file = "protobuf-5.27.2-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:b848dbe1d57ed7c191dfc4ea64b8b004a3f9ece4bf4d0d80a367b76df20bf36e"}, + {file = "protobuf-5.27.2-cp38-cp38-win32.whl", hash = "sha256:4fadd8d83e1992eed0248bc50a4a6361dc31bcccc84388c54c86e530b7f58863"}, + {file = "protobuf-5.27.2-cp38-cp38-win_amd64.whl", hash = "sha256:610e700f02469c4a997e58e328cac6f305f649826853813177e6290416e846c6"}, + {file = "protobuf-5.27.2-cp39-cp39-win32.whl", hash = "sha256:9e8f199bf7f97bd7ecebffcae45ebf9527603549b2b562df0fbc6d4d688f14ca"}, + {file = "protobuf-5.27.2-cp39-cp39-win_amd64.whl", hash = "sha256:7fc3add9e6003e026da5fc9e59b131b8f22b428b991ccd53e2af8071687b4fce"}, + {file = "protobuf-5.27.2-py3-none-any.whl", hash = "sha256:54330f07e4949d09614707c48b06d1a22f8ffb5763c159efd5c0928326a91470"}, + {file = "protobuf-5.27.2.tar.gz", hash = "sha256:f3ecdef226b9af856075f28227ff2c90ce3a594d092c39bee5513573f25e2714"}, ] [[package]] @@ -2142,26 +2160,20 @@ files = [ [[package]] name = "ray" -version = "2.20.0" +version = "2.23.0" description = "Ray provides a simple, universal API for building distributed applications." optional = false python-versions = ">=3.8" files = [ - {file = "ray-2.20.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:8855a5df8b3e6b8bcb5582a8491c50d0237e70751f941e8978bd6408245b7838"}, - {file = "ray-2.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0566b28c75aad1d47b9403c3901a85db586ce7191fdc6978e07ad56e80bf82b"}, - {file = "ray-2.20.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:738c68f4114754f846b3d03b730b42a6468f8b54665732da9f9108aa1d3ecbe3"}, - {file = "ray-2.20.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:2c7f8cd468cbba009d7ebd8a8da66026aeb520f7f4183dd6f49419d75bc84415"}, - {file = "ray-2.20.0-cp310-cp310-win_amd64.whl", hash = "sha256:611d34d0c659652a38ef482a82dfc362074984617765e1d5a414337e4f914cfd"}, - {file = "ray-2.20.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:f7816767e644014f65afbfceb6adfb08c15784a4227aa331b28ac90d1b757a58"}, - {file = "ray-2.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e98df29fd6dac52c87c1f5be5ad99601a8955eaabe921e5cab29b27775250ce"}, - {file = "ray-2.20.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:e84ddad1521e06c91fc641f2b856d33ca2bfa314784172862c41a5184e0e760b"}, - {file = "ray-2.20.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:d9b13815fae5c9a68c9a02f21e1c49c58a5bb6565cb9ed5d48571cacce7568f2"}, - {file = "ray-2.20.0-cp311-cp311-win_amd64.whl", hash = "sha256:6ac1dcb303ddf53d2d87bc5b719e8c38f0a5efe41e175b6ba563fb65b5f4e9a2"}, - {file = "ray-2.20.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:1de0810f77ae4a0bf055aa2bdcb161be1d6d1b67b4095e85a5b3fbb6e0dadcd2"}, - {file = "ray-2.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f3519dd7794ead4d3e17d4570593b2a10e8db062836907517e85b4e769dec1a"}, - {file = "ray-2.20.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5a2cb9f100bbb6351372519b03ddc21d9fa6c8716621237273a59a6e250a8204"}, - {file = "ray-2.20.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:64b394a6462a2ac2401b1b004f2cc7ac31e429388abf27024072a55702f1159c"}, - {file = "ray-2.20.0-cp39-cp39-win_amd64.whl", hash = "sha256:65938f7bd28a825d90c643465ad6b1334d97d16e381c409b19269e4dcc043341"}, + {file = "ray-2.23.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:4f5ea8dc8fc014704ea12ef8a569abf0deca2ba2a6f157dc5fdd1789db4e0a65"}, + {file = "ray-2.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc950898871c3ecf3b921295c5fcf47b4a30b57b54be8f369014fb1eb9b4cfa5"}, + {file = "ray-2.23.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:55610f8eae65ce5686bde75a5782ce63e2a0112ccd2262b8acd707264da6dbea"}, + {file = "ray-2.23.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1a43d94ce3f14490e6f1e3e868fd6a5f3be4878cbf83c4bcdc741861d6a4dbf6"}, + {file = "ray-2.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2d2c1d59d7c8bd8b97288f7ae9a6bf762bd4e703b57787282400d3176dd159d"}, + {file = "ray-2.23.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:15c109fd9969326323c8bdb0701cd9af21c85f465002f74950622f9a580ec4e5"}, + {file = "ray-2.23.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:e7d059f094dedae36dddeaf792ebb74d4eed1a8ab1fb540dbffce4ac22694800"}, + {file = "ray-2.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7c305f31674fb8319c147d66e27dd210c7ad6d375626307ddfc62137a26d4155"}, + {file = "ray-2.23.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:b40f85c67ee3d58732b4021460c4297eb418f466313d70b577e5bf9fbb4c2d16"}, ] [package.dependencies] @@ -2196,9 +2208,9 @@ virtualenv = {version = ">=20.0.24,<20.21.1 || >20.21.1", optional = true, marke [package.extras] air = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "fastapi", "fsspec", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "memray", "numpy (>=1.20)", "opencensus", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -all = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "dm-tree", "fastapi", "fsspec", "grpcio", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "memray", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.20.0)", "requests", "rich", "scikit-image", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -client = ["grpcio"] -cpp = ["ray-cpp (==2.20.0)"] +all = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "dm-tree", "fastapi", "fsspec", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "memray", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.23.0)", "requests", "rich", "scikit-image", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +client = ["grpcio (!=1.56.0)"] +cpp = ["ray-cpp (==2.23.0)"] data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (>=6.0.1)"] default = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "memray", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "virtualenv (>=20.0.24,!=20.21.1)"] observability = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] @@ -2404,13 +2416,13 @@ pyasn1 = ">=0.1.3" [[package]] name = "setuptools" -version = "70.1.0" +version = "70.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"}, - {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"}, + {file = "setuptools-70.1.1-py3-none-any.whl", hash = "sha256:a58a8fde0541dab0419750bcc521fbdf8585f6e5cb41909df3a472ef7b81ca95"}, + {file = "setuptools-70.1.1.tar.gz", hash = "sha256:937a48c7cdb7a21eb53cd7f9b59e525503aa8abaf3584c730dc5f7a5bec3a650"}, ] [package.extras] @@ -2539,13 +2551,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.26.2" +version = "20.26.3" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.2-py3-none-any.whl", hash = "sha256:a624db5e94f01ad993d476b9ee5346fdf7b9de43ccaee0e0197012dc838a0e9b"}, - {file = "virtualenv-20.26.2.tar.gz", hash = "sha256:82bf0f4eebbb78d36ddaee0283d43fe5736b53880b8a8cdcd37390a07ac3741c"}, + {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"}, + {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"}, ] [package.dependencies] @@ -2795,4 +2807,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d656bab99c2e5a911ee1003db9e0682141328ae3ef1e1620945f8479451425bf" +content-hash = "70c25813b589204a08c4143380cb8fe06de901dd6a83162b8f93a51945072629" diff --git a/pyproject.toml b/pyproject.toml index af7dd1ca..457e6de9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ keywords = ['codeflare', 'python', 'sdk', 'client', 'batch', 'scale'] python = "^3.9" openshift-client = "1.0.18" rich = "^12.5" -ray = {version = "2.20.0", extras = ["data", "default"]} +ray = {version = "2.23.0", extras = ["data", "default"]} kubernetes = ">= 25.3.0, < 27" cryptography = "40.0.2" executing = "1.2.0" diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 5c0c919d..7b36146a 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -12,7 +12,7 @@ metadata: namespace: default spec: # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.20.0' + rayVersion: '2.23.0' # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. @@ -69,7 +69,7 @@ spec: containers: # The Ray head pod - name: ray-head - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always ports: - containerPort: 6379 @@ -152,7 +152,7 @@ spec: spec: containers: - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + image: quay.io/rhoai/ray:2.23.0-py39-cu121 # environment variables to set in the container.Optional. # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ lifecycle: diff --git a/tests/e2e/support.py b/tests/e2e/support.py index 04c9cb42..d8a06bb7 100644 --- a/tests/e2e/support.py +++ b/tests/e2e/support.py @@ -8,7 +8,7 @@ def get_ray_image(): - default_ray_image = "quay.io/project-codeflare/ray:2.20.0-py39-cu118" + default_ray_image = "quay.io/rhoai/ray:2.23.0-py39-cu121" return os.getenv("RAY_IMAGE", default_ray_image) diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index 18dcb7d7..d4d230d4 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -42,7 +42,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always lifecycle: preStop: @@ -68,7 +68,7 @@ spec: cpu: 2 memory: 8G nvidia.com/gpu: 0 - rayVersion: 2.20.0 + rayVersion: 2.23.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster maxReplicas: 2 @@ -90,7 +90,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + image: quay.io/rhoai/ray:2.23.0-py39-cu121 lifecycle: preStop: exec: diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index d8d2516c..2d0e7e9b 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -33,7 +33,7 @@ spec: template: spec: containers: - - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + - image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always lifecycle: preStop: @@ -89,7 +89,7 @@ spec: name: odh-trusted-ca-bundle optional: true name: odh-ca-cert - rayVersion: 2.20.0 + rayVersion: 2.23.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster-ray maxReplicas: 2 @@ -106,7 +106,7 @@ spec: key: value spec: containers: - - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + - image: quay.io/rhoai/ray:2.23.0-py39-cu121 lifecycle: preStop: exec: diff --git a/tests/test-case.yaml b/tests/test-case.yaml index c5229ce7..00b241af 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -38,7 +38,7 @@ spec: template: spec: containers: - - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + - image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always lifecycle: preStop: @@ -94,7 +94,7 @@ spec: name: odh-trusted-ca-bundle optional: true name: odh-ca-cert - rayVersion: 2.20.0 + rayVersion: 2.23.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster maxReplicas: 2 @@ -111,7 +111,7 @@ spec: key: value spec: containers: - - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + - image: quay.io/rhoai/ray:2.23.0-py39-cu121 lifecycle: preStop: exec: diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index 8fd1873f..cc44e234 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -38,7 +38,7 @@ spec: template: spec: containers: - - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + - image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always lifecycle: preStop: @@ -93,7 +93,7 @@ spec: name: odh-trusted-ca-bundle optional: true name: odh-ca-cert - rayVersion: 2.20.0 + rayVersion: 2.23.0 workerGroupSpecs: - groupName: small-group-unit-test-default-cluster maxReplicas: 1 @@ -110,7 +110,7 @@ spec: key: value spec: containers: - - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 + - image: quay.io/rhoai/ray:2.23.0-py39-cu121 lifecycle: preStop: exec: diff --git a/tests/unit_test.py b/tests/unit_test.py index 61870b2a..6346d462 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -261,7 +261,7 @@ def test_config_creation(): assert config.min_cpus == 3 and config.max_cpus == 4 assert config.min_memory == "5G" and config.max_memory == "6G" assert config.num_gpus == 7 - assert config.image == "quay.io/project-codeflare/ray:2.20.0-py39-cu118" + assert config.image == "quay.io/rhoai/ray:2.23.0-py39-cu121" assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml" assert config.machine_types == ["cpu.small", "gpu.large"] assert config.image_pull_secrets == ["unit-test-pull-secret"] @@ -400,7 +400,7 @@ def test_cluster_creation_no_mcad_local_queue(mocker): num_gpus=7, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", write_to_file=True, appwrapper=False, local_queue="local-queue-default", @@ -428,7 +428,7 @@ def test_default_cluster_creation(mocker): ) default_config = ClusterConfiguration( name="unit-test-default-cluster", - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", appwrapper=True, ) cluster = Cluster(default_config) @@ -777,7 +777,7 @@ def test_ray_job_wrapping(mocker): return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), ) cluster = cluster = createClusterWithConfig(mocker) - cluster.config.image = "quay.io/project-codeflare/ray:2.20.0-py39-cu118" + cluster.config.image = "quay.io/rhoai/ray:2.23.0-py39-cu121" mocker.patch( "ray.job_submission.JobSubmissionClient._check_connection_and_version_with_url", return_value="None", @@ -897,7 +897,7 @@ def test_ray_details(mocker, capsys): ClusterConfiguration( name="raytest2", namespace="ns", - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", write_to_file=True, appwrapper=True, local_queue="local_default_queue", @@ -1183,7 +1183,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None): }, }, }, - "rayVersion": "2.20.0", + "rayVersion": "2.23.0", "workerGroupSpecs": [ { "groupName": "small-group-quicktest", @@ -1417,7 +1417,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None): } }, }, - "rayVersion": "2.20.0", + "rayVersion": "2.23.0", "workerGroupSpecs": [ { "groupName": "small-group-quicktest2", @@ -1790,7 +1790,7 @@ def get_aw_obj(group, version, namespace, plural): } }, }, - "rayVersion": "2.20.0", + "rayVersion": "2.23.0", "workerGroupSpecs": [ { "groupName": "small-group-quicktest", @@ -2294,7 +2294,7 @@ def test_cluster_status(mocker): ClusterConfiguration( name="test", namespace="ns", - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", write_to_file=True, appwrapper=True, local_queue="local_default_queue", @@ -2389,7 +2389,7 @@ def test_wait_ready(mocker, capsys): ClusterConfiguration( name="test", namespace="ns", - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", write_to_file=True, appwrapper=True, local_queue="local-queue-default", @@ -2616,7 +2616,7 @@ def throw_if_getting_raycluster(group, version, namespace, plural): cluster = Cluster( ClusterConfiguration( "test_cluster", - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", write_to_file=False, ) ) diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py index 6e662db4..9e7a60b6 100644 --- a/tests/unit_test_support.py +++ b/tests/unit_test_support.py @@ -17,7 +17,7 @@ def createClusterConfig(): appwrapper=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], - image="quay.io/project-codeflare/ray:2.20.0-py39-cu118", + image="quay.io/rhoai/ray:2.23.0-py39-cu121", write_to_file=True, ) return config From da54cc70bec0b2206ad2a36f80c3b8e02dd71fb9 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 28 Jun 2024 17:32:16 +0200 Subject: [PATCH 2/2] test: Upgrade pytorch_lightning to fix invalid metadata --- .../guided-demos/notebook-ex-outputs/requirements.txt | 2 +- demo-notebooks/guided-demos/preview_nbs/requirements.txt | 2 +- tests/e2e/mnist_pip_requirements.txt | 2 +- tests/unit_test.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt b/demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt index 7266b064..e77d612b 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/requirements.txt @@ -1,4 +1,4 @@ -pytorch_lightning==1.5.10 +pytorch_lightning==1.9.5 ray_lightning torchmetrics==0.9.1 torchvision==0.12.0 diff --git a/demo-notebooks/guided-demos/preview_nbs/requirements.txt b/demo-notebooks/guided-demos/preview_nbs/requirements.txt index 7266b064..e77d612b 100644 --- a/demo-notebooks/guided-demos/preview_nbs/requirements.txt +++ b/demo-notebooks/guided-demos/preview_nbs/requirements.txt @@ -1,4 +1,4 @@ -pytorch_lightning==1.5.10 +pytorch_lightning==1.9.5 ray_lightning torchmetrics==0.9.1 torchvision==0.12.0 diff --git a/tests/e2e/mnist_pip_requirements.txt b/tests/e2e/mnist_pip_requirements.txt index 87edeef2..4c9d5fcb 100644 --- a/tests/e2e/mnist_pip_requirements.txt +++ b/tests/e2e/mnist_pip_requirements.txt @@ -1,3 +1,3 @@ -pytorch_lightning==1.5.10 +pytorch_lightning==1.9.5 torchmetrics==0.9.1 torchvision==0.12.0 diff --git a/tests/unit_test.py b/tests/unit_test.py index 6346d462..e8fa61c3 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -2767,8 +2767,8 @@ def test_rjc_tail_job_logs(ray_job_client, mocker): def test_rjc_list_jobs(ray_job_client, mocker): jobs_list = [ - "JobDetails(type=, job_id=None, submission_id='raysubmit_4k2NYS1YbRXYPZCM', driver_info=None, status=, entrypoint='python mnist.py', message='Job finished successfully.', error_type=None, start_time=1701352132585, end_time=1701352192002, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_6200b93a110e8033.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address='http://10.131.0.18:52365', driver_node_id='9fb515995f5fb13ad4db239ceea378333bebf0a2d45b6aa09d02e691')", - "JobDetails(type=, job_id=None, submission_id='raysubmit_iRuwU8vdkbUZZGvT', driver_info=None, status=, entrypoint='python mnist.py', message='Job was intentionally stopped.', error_type=None, start_time=1701353096163, end_time=1701353097733, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_6200b93a110e8033.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address='http://10.131.0.18:52365', driver_node_id='9fb515995f5fb13ad4db239ceea378333bebf0a2d45b6aa09d02e691')", + "JobDetails(type=, job_id=None, submission_id='raysubmit_4k2NYS1YbRXYPZCM', driver_info=None, status=, entrypoint='python mnist.py', message='Job finished successfully.', error_type=None, start_time=1701352132585, end_time=1701352192002, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_6200b93a110e8033.zip', 'pip': {'packages': ['pytorch_lightning==1.9.5', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address='http://10.131.0.18:52365', driver_node_id='9fb515995f5fb13ad4db239ceea378333bebf0a2d45b6aa09d02e691')", + "JobDetails(type=, job_id=None, submission_id='raysubmit_iRuwU8vdkbUZZGvT', driver_info=None, status=, entrypoint='python mnist.py', message='Job was intentionally stopped.', error_type=None, start_time=1701353096163, end_time=1701353097733, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_6200b93a110e8033.zip', 'pip': {'packages': ['pytorch_lightning==1.9.5', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address='http://10.131.0.18:52365', driver_node_id='9fb515995f5fb13ad4db239ceea378333bebf0a2d45b6aa09d02e691')", ] mocked_rjc_list_jobs = mocker.patch.object( JobSubmissionClient, "list_jobs", return_value=jobs_list