diff --git "a/VSF/demo.ipynb" "b/VSF/demo.ipynb"
deleted file mode 100644
--- "a/VSF/demo.ipynb"
+++ /dev/null
@@ -1,506 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "829d28e9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from src.pipeline import VSFStableDiffusion3Pipeline"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7b67ac63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "\n",
-    "pipe = VSFStableDiffusion3Pipeline.from_pretrained(\n",
-    "    \"stabilityai/stable-diffusion-3.5-large-turbo\",\n",
-    "    torch_dtype=torch.bfloat16,\n",
-    ").to(\"cuda\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "48cde82c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "from nag import NAGStableDiffusion3Pipeline\n",
-    "\n",
-    "# Build the NAG baseline from the same components so both methods\n",
-    "# share weights, scheduler, and text encoders.\n",
-    "pipe_nag = NAGStableDiffusion3Pipeline(\n",
-    "    vae=pipe.vae,\n",
-    "    text_encoder=pipe.text_encoder,\n",
-    "    text_encoder_2=pipe.text_encoder_2,\n",
-    "    text_encoder_3=pipe.text_encoder_3,\n",
-    "    tokenizer=pipe.tokenizer,\n",
-    "    tokenizer_2=pipe.tokenizer_2,\n",
-    "    tokenizer_3=pipe.tokenizer_3,\n",
-    "    scheduler=pipe.scheduler,\n",
-    "    transformer=pipe.transformer,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea5c36ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# VSF\n",
-    "prompt = \"A poker table is set in the casino room, green felt stretched tight over the oval surface.\"\n",
-    "negative_prompt = \"cards\"\n",
-    "image_ours = pipe(\n",
-    "    prompt=prompt,\n",
-    "    negative_prompt=negative_prompt,\n",
-    "    guidance_scale=0.0,\n",
-    "    num_inference_steps=8,\n",
-    "    scale=3.5,\n",
-    "    offset=0.1\n",
-    ").images[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "95d240ea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# NAG\n",
-    "image_nag = pipe_nag(\n",
-    "    prompt,\n",
-    "    nag_negative_prompt=negative_prompt,\n",
-    "    guidance_scale=0.0,\n",
-    "    nag_scale=5,\n",
-    "    nag_alpha=0.7,\n",
-    "    nag_tau=6.5,\n",
-    "    num_inference_steps=8,\n",
-    ").images[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b64a8d8b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "plt.rcParams['figure.figsize'] = (10, 5)\n",
-    "plt.subplot(1, 2, 1)\n",
-    "plt.imshow(image_ours)\n",
-    "plt.title(\"VSF\")\n",
-    "plt.axis(\"off\")\n",
-    "plt.subplot(1, 2, 2)\n",
-    "plt.imshow(image_nag)\n",
-    "plt.title(\"NAG\")\n",
-    "plt.axis(\"off\")\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "52e3b317",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = \"a chef cat making a cake in the kitchen, the kitchen is modern and well-lit, the text on the cake is saying 'I LOVE AI', the whole image is in oil paint style\"\n",
-    "image = pipe(\n",
-    "    prompt,\n",
-    "    negative_prompt=\"icing\",\n",
-    "    guidance_scale=0.0,\n",
-    "    num_inference_steps=3,\n",
-    "    max_sequence_length=256,\n",
-    "    scale=4,\n",
-    "    generator=torch.Generator(\"cpu\").manual_seed(0)\n",
-    ").images[0]\n",
-    "image"
-   ]
-  },
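-  {
-   "cell_type": "markdown",
-   "id": "gpu-cleanup-md",
-   "metadata": {},
-   "source": [
-    "Before the Wan2.1 video demo below, it helps to release the SD3.5 pipelines: the transformer plus three text encoders are large, and keeping them resident while also loading the 1.3B Wan model can exhaust a single GPU. The next cell is a minimal cleanup sketch; it assumes `pipe` and `pipe_nag` are no longer needed (`pipe` is reassigned to the Wan pipeline afterwards)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "gpu-cleanup",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Minimal cleanup sketch: drop the SD3.5 pipelines and return their\n",
-    "# GPU memory to the allocator before loading Wan2.1 below.\n",
-    "import gc\n",
-    "\n",
-    "del pipe, pipe_nag\n",
-    "gc.collect()\n",
-    "torch.cuda.empty_cache()"
-   ]
-  },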
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 256, 4096]) torch.Size([256, 3]) torch.Size([1, 4, 4096]) torch.Size([4, 3])\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "367f140676f3496b842d0bd19c67e72b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/8 [00:00 54\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mpipe\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43mprompt_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprompt_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mnegative_prompt\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mneg_prompt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mheight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43mwidth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwidth\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 59\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_frames\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframes\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_inference_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m12\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 61\u001b[0m \u001b[43m \u001b[49m\u001b[43mguidance_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 62\u001b[0m \u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mGenerator\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcuda\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmanual_seed\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m42\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mframes[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 64\u001b[0m export_to_video(output, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvsf.mp4\u001b[39m\u001b[38;5;124m\"\u001b[39m, fps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m15\u001b[39m)\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m~/turbo/VSF/vsfwan/pipeline.py:547\u001b[0m, in \u001b[0;36mWanPipeline.__call__\u001b[0;34m(self, prompt, negative_prompt, height, width, num_frames, num_inference_steps, guidance_scale, num_videos_per_prompt, generator, latents, prompt_embeds, negative_prompt_embeds, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, max_sequence_length)\u001b[0m\n\u001b[1;32m 544\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(block\u001b[38;5;241m.\u001b[39mattn2, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprocessor\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 545\u001b[0m block\u001b[38;5;241m.\u001b[39mattn2\u001b[38;5;241m.\u001b[39mprocessor\u001b[38;5;241m.\u001b[39mpos \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 547\u001b[0m noise_pred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransformer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 548\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlatent_model_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 549\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimestep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimestep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 550\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprompt_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 553\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 555\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_classifier_free_guidance:\n\u001b[1;32m 556\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m block \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer\u001b[38;5;241m.\u001b[39mblocks:\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip 
the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1760\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1761\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m~/miniconda/envs/neg/lib/python3.10/site-packages/diffusers/models/transformers/transformer_wan.py:469\u001b[0m, in \u001b[0;36mWanTransformer3DModel.forward\u001b[0;34m(self, hidden_states, timestep, encoder_hidden_states, encoder_hidden_states_image, return_dict, attention_kwargs)\u001b[0m\n\u001b[1;32m 465\u001b[0m post_patch_width \u001b[38;5;241m=\u001b[39m width \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m p_w\n\u001b[1;32m 467\u001b[0m rotary_emb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrope(hidden_states)\n\u001b[0;32m--> 469\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpatch_embedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 470\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mflatten(\u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 472\u001b[0m temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcondition_embedder(\n\u001b[1;32m 473\u001b[0m timestep, encoder_hidden_states, encoder_hidden_states_image\n\u001b[1;32m 474\u001b[0m )\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1760\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1761\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/conv.py:725\u001b[0m, in \u001b[0;36mConv3d.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 724\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 725\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_conv_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/conv.py:720\u001b[0m, in \u001b[0;36mConv3d._conv_forward\u001b[0;34m(self, input, weight, bias)\u001b[0m\n\u001b[1;32m 708\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzeros\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 709\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mconv3d(\n\u001b[1;32m 
710\u001b[0m F\u001b[38;5;241m.\u001b[39mpad(\n\u001b[1;32m 711\u001b[0m \u001b[38;5;28minput\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reversed_padding_repeated_twice, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroups,\n\u001b[1;32m 719\u001b[0m )\n\u001b[0;32m--> 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv3d\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 721\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdilation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m\n\u001b[1;32m 722\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 2 has a total capacity of 47.53 GiB of which 69.06 MiB is free. Process 2991192 has 2.57 GiB memory in use. Including non-PyTorch memory, this process has 44.87 GiB memory in use. Of the allocated memory 43.97 GiB is allocated by PyTorch, and 602.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" - ] - } - ], - "source": [ - "import torch\n", - "from diffusers import AutoencoderKLWan\n", - "from vsfwan.pipeline import WanPipeline\n", - "from vsfwan.processor import WanAttnProcessor2_0\n", - "from diffusers import WanVACEPipeline\n", - "from diffusers.utils import export_to_video\n", - "\n", - "model_id = \"Wan-AI/Wan2.1-T2V-1.3B-Diffusers\"\n", - "vae = AutoencoderKLWan.from_pretrained(model_id, subfolder=\"vae\", torch_dtype=torch.float32)\n", - "pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)\n", - "pipe.load_lora_weights(\n", - " \"Kijai/WanVideo_comfy\",\n", - " weight_name=\"Wan21_CausVid_bidirect2_T2V_1_3B_lora_rank32.safetensors\",\n", - " adapter_name=\"lora\"\n", - ") \n", - "pipe = pipe.to(\"cuda\")\n", - "\n", - "# prompt = \"A chef cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The cat is wearing a chef suit\"\n", - "# neg_prompt = \"chef hat\"\n", - "prompt = \"A cessna flying over a snowy mountain landscape, with a clear blue sky and fluffy white clouds. The plane is flying at a low altitude, casting a shadow on the snow-covered ground below. 
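-  {
-   "cell_type": "markdown",
-   "id": "oom-tips-md",
-   "metadata": {},
-   "source": [
-    "If the generation cell above runs out of CUDA memory, two common mitigations apply: the allocator hint is the one PyTorch's own OOM message suggests, and model CPU offload is standard diffusers API. Both are shown as a commented sketch below rather than live code, since they must be applied before the pipeline touches the GPU."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "oom-tips",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 1) Reduce allocator fragmentation. This must be set before CUDA is\n",
-    "#    first initialized, e.g. in the shell that launches Jupyter:\n",
-    "#    export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True\n",
-    "\n",
-    "# 2) Keep submodules on CPU and stream them to the GPU on demand,\n",
-    "#    instead of calling pipe.to(\"cuda\") above:\n",
-    "# pipe.enable_model_cpu_offload()"
-   ]
-  }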
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}