Spaces · Runtime error

Commit 849a3f2 (parent: 78a732a), committed by dung-vpt-uney

Update Visual-CoT demo - 2025-10-12 22:59:40
Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers); see the sketch below
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
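
The registration error called out above is the one newer transformers releases raise when a custom config is registered under a model type they already ship for LLaVA (typically a `ValueError` complaining that the model type is already used). Below is a minimal sketch of that kind of compatibility guard, assuming placeholder names (`LlavaCoTConfig`, `"llava_cot"`) rather than the project's real identifiers; it is an illustration, not the actual patch in this commit.

```python
# Hedged sketch (assumption, not the commit's actual patch): guard custom
# config registration so the app still imports when a newer transformers
# release already knows the model type. LlavaCoTConfig / "llava_cot" are
# placeholder names.
from transformers import AutoConfig, PretrainedConfig


class LlavaCoTConfig(PretrainedConfig):
    """Placeholder stand-in for the demo's custom LLaVA-style config."""

    model_type = "llava_cot"


try:
    AutoConfig.register("llava_cot", LlavaCoTConfig)
except ValueError:
    # The model type is already registered (newer transformers, or a
    # repeated import); skip re-registration instead of crashing the Space.
    pass
```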
app.py
CHANGED
@@ -415,7 +415,7 @@ def create_demo():
         # ============================================================
         # Tab 1: Interactive Demo
         # ============================================================
-        with gr.Tab("
+        with gr.Tab("Interactive Demo"):
             gr.Markdown("""
             ### Try Visual-CoT with Your Own Images!
 
@@ -481,7 +481,7 @@ def create_demo():
             )
 
             with gr.Group():
-                gr.Markdown("####
+                gr.Markdown("#### Visualization")
                 image_output = gr.Image(
                     label="Image with Bounding Box",
                     type="pil",
@@ -520,7 +520,7 @@ def create_demo():
         # ============================================================
         # Tab 2: Benchmark Explorer
         # ============================================================
-        with gr.Tab("
+        with gr.Tab("Benchmark Explorer"):
             gr.Markdown("""
             ### Explore Visual-CoT Benchmark Examples
 
@@ -568,9 +568,9 @@ def create_demo():
         # ============================================================
         # Tab 3: About & Paper
         # ============================================================
-        with gr.Tab("
+        with gr.Tab("About"):
             gr.Markdown("""
-            ##
+            ## Paper Information
 
             **Title:** Visual CoT: Advancing Multi-Modal Language Models with a Comprehensive Dataset and Benchmark for Chain-of-Thought Reasoning
 
@@ -586,35 +586,27 @@ def create_demo():
 
             ---
 
-            ##
+            ## Model Architecture
 
             ```
-
-
-
-
-
-
-
-
-
-
-
-
-
-            │  │ Step 1: ROI  │  → Bounding Box  │
-            │  └──────────────┘                  │
-            │         ↓                          │
-            │  ┌──────────────┐                  │
-            │  │ Step 2: QA   │  → Final Answer  │
-            │  └──────────────┘                  │
-            │                                    │
-            └────────────────────────────────────┘
+            Visual-CoT Pipeline:
+
+            Image Input
+                 ↓
+            CLIP ViT-L/14 (Vision Encoder)
+                 ↓
+            MLP Projector (2-layer)
+                 ↓
+            LLaMA/Vicuna (Language Model)
+                 ↓
+            Step 1: ROI Detection → Bounding Box
+                 ↓
+            Step 2: Question Answering → Final Answer
             ```
 
             ---
 
-            ##
+            ## Key Results
 
             - **Detection Accuracy**: 75.3% (IoU > 0.5)
             - **Answer Accuracy**: 82.7% (GPT-3.5 evaluated)
@@ -624,13 +616,13 @@ def create_demo():
 
             ---
 
-            ##
+            ## Resources
 
-            -
-            -
-            -
-            -
-            -
+            - **Paper**: [arXiv:2403.16999](https://arxiv.org/abs/2403.16999)
+            - **Code**: [GitHub](https://github.com/deepcs233/Visual-CoT)
+            - **Dataset**: [Hugging Face](https://huggingface.co/datasets/deepcs233/Visual-CoT)
+            - **Project Page**: [https://hao-shao.com/projects/viscot.html](https://hao-shao.com/projects/viscot.html)
+            - **Models**:
               - [VisCoT-7b-224](https://huggingface.co/deepcs233/VisCoT-7b-224)
               - [VisCoT-7b-336](https://huggingface.co/deepcs233/VisCoT-7b-336)
               - [VisCoT-13b-224](https://huggingface.co/deepcs233/VisCoT-13b-224)
@@ -638,7 +630,7 @@ def create_demo():
 
             ---
 
-            ##
+            ## Citation
 
             If you find our work useful, please cite:
 
@@ -653,7 +645,7 @@ def create_demo():
 
             ---
 
-            ##
+            ## License
 
             - **Code**: Apache License 2.0
             - **Dataset**: Research use only
@@ -661,7 +653,7 @@ def create_demo():
 
             ---
 
-            ##
+            ## Acknowledgements
 
             This work is built upon:
             - [LLaVA](https://github.com/haotian-liu/LLaVA) - Base architecture
@@ -674,8 +666,7 @@ def create_demo():
         gr.Markdown("""
         ---
         <div style="text-align: center; color: #666; padding: 20px;">
-            <p
-            <p>Made with ❤️ by the Visual-CoT Team</p>
+            <p>Powered by <a href="https://huggingface.co/docs/hub/spaces-zerogpu">Zero GPU</a> on Hugging Face Spaces</p>
         </div>
         """)
 
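
The new "Model Architecture" section in this diff describes a two-stage chain: the model first localizes a region of interest as a bounding box, then answers the question with that region in context. A minimal sketch of that two-step loop follows, assuming a generic `generate(image, prompt) -> str` callable and placeholder prompt wording; the project's actual inference code and prompt templates may differ.

```python
# Minimal sketch of the two-step Visual-CoT flow described in the diff's
# "Model Architecture" section. `generate` stands in for whatever
# image+prompt -> text call the demo wires up; names and prompt wording
# here are assumptions, not the project's actual API.
from typing import Callable, Tuple

from PIL import Image


def visual_cot_answer(
    image: Image.Image,
    question: str,
    generate: Callable[[Image.Image, str], str],
) -> Tuple[str, str]:
    # Step 1: ROI detection - ask the model for a bounding box around the
    # region that is relevant to the question.
    bbox = generate(
        image,
        f"{question}\nPlease provide the bounding box coordinates of the "
        "region that can help answer the question.",
    )
    # Step 2: question answering - feed the predicted box back in and ask
    # for the final answer.
    answer = generate(
        image,
        f"{question}\nRelevant region: {bbox}\nPlease answer the question.",
    )
    return bbox, answer
```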