Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Gemini

Sleeping

App Files Files Community

awacke1 commited on Jul 19

Commit

c42c89f

verified ·

1 Parent(s): c2cabbd

Create app.py.v1

Browse files

Files changed (1) hide show

app.py.v1 +324 -0

app.py.v1 ADDED Viewed

	@@ -0,0 +1,324 @@

+# app.py
+import gradio as gr
+import pandas as pd
+import requests
+import io
+import dask.dataframe as dd
+from datasets import load_dataset, Image
+from mlcroissant import Dataset as CroissantDataset
+from huggingface_hub import get_token
+import polars as pl
+import warnings
+import traceback
# 🤫 Silence every warning category for the whole process (noisy third-party libraries).
warnings.filterwarnings("ignore")

# --- ⚙️ Configuration & Constants ---
# 🏷️ Access-method labels shown in the UI. fetch_data() dispatches on the
# keyword embedded in each label ("API", "Pandas", "Polars", ...).
_VIA_API = "💨 API (requests)"
_VIA_PANDAS = "🐼 Pandas"
_VIA_POLARS = "🧊 Polars"
_VIA_DASK = "🧊 Dask"
_VIA_DATASETS = "🤗 Datasets"
_VIA_CROISSANT = "🥐 Croissant"

# 🎨 One entry per dataset tab:
#   name       -> Hugging Face repo id
#   emoji      -> icon used in the tab title and heading
#   search_col -> a column name, or a list of column names, to search
#   methods    -> access methods offered for this dataset (first one is the default)
#   is_public  -> when False, requests are sent with the user's HF token
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project",
        "emoji": "⚖️",
        "search_col": "text",
        "methods": [_VIA_API, _VIA_DASK, _VIA_CROISSANT],
        "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts",
        "emoji": "🤖",
        "search_col": ["act", "prompt"],
        "methods": [_VIA_PANDAS, _VIA_API, _VIA_CROISSANT],
        "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning",
        "emoji": "💰",
        "search_col": ["question", "answer"],
        "methods": [_VIA_PANDAS, _VIA_POLARS, _VIA_API, _VIA_CROISSANT],
        "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "emoji": "🩺",
        "search_col": "conversations",
        "methods": [_VIA_PANDAS, _VIA_POLARS, _VIA_API, _VIA_CROISSANT],
        "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset",
        "emoji": "🖼️",
        "search_col": "text",
        "methods": [_VIA_DATASETS, _VIA_PANDAS, _VIA_POLARS, _VIA_API, _VIA_CROISSANT],
        "is_public": False,
    },
}
+# --- 🛠️ Helpers & Utility Functions ---
def get_auth_headers():
    """🔑 Return a Bearer `Authorization` header for the stored Hugging Face token, or `{}` when there is none."""
    hf_token = get_token()
    if not hf_token:
        return {}
    return {"Authorization": f"Bearer {hf_token}"}
def dataframe_to_outputs(df: pd.DataFrame):
    """
    📜 Render a results DataFrame in every export format the UI offers.

    Args:
        df: The (already filtered) results to export.

    Returns:
        A 4-tuple ``(markdown, csv_file, xlsx_file, tab_delimited)``:
        markdown table text, path of a CSV file, path of an XLSX file, and
        tab-separated text for copy/paste. For an empty frame the two file
        slots are ``None`` and the text slots carry a "no results" message.
    """
    # Function-scope imports keep this fix self-contained.
    import os
    import tempfile

    if df.empty:
        return "No results found. 🤷", None, None, "No results to copy."

    # Stringify first so cells holding lists/dicts/objects still render as a table.
    markdown_output = df.astype(str).to_markdown(index=False)
    tab_delimited_output = df.to_csv(sep='\t', index=False)

    # The download slots are fed file paths: the File component has no
    # `from_bytes` constructor, so the exports are written to disk instead.
    # Each call gets its own directory so concurrent users never overwrite
    # each other's "results.*" files.
    export_dir = tempfile.mkdtemp(prefix="dataset_explorer_")
    csv_path = os.path.join(export_dir, "results.csv")
    xlsx_path = os.path.join(export_dir, "results.xlsx")
    df.to_csv(csv_path, index=False)
    df.to_excel(xlsx_path, index=False, engine='openpyxl')

    return markdown_output, csv_path, xlsx_path, tab_delimited_output
def handle_error(e: Exception):
    """
    😱 Turn an exception into the 7-slot output tuple the UI expects.

    Args:
        e: The exception that aborted the fetch.

    Returns:
        ``(empty DataFrame, empty gallery, markdown error + traceback, None,
        None, plain error text, commented-out code snippet)`` — one value per
        output component wired to the fetch button.
    """
    error_text = str(e)
    error_message = f"🚨 An error occurred: {error_text}\n\n"
    auth_tip = "🔑 For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."

    # Build the trace from the exception object itself so this helper also
    # works when it is called outside an active `except` block.
    full_trace = "".join(traceback.format_exception(type(e), e, e.__traceback__))
    print(full_trace)

    # Case-insensitive so both "Gated" and "gated repo" style messages get the login hint.
    lowered = error_text.lower()
    if "401" in lowered or "gated" in lowered:
        error_message += auth_tip

    return (
        pd.DataFrame(),
        gr.Gallery(None, label="🖼️ Image Results"),
        f"```\n{error_message}\n\n{full_trace}\n```",
        None,
        None,
        error_message,
        f"```python\n# 🚨 Error during code generation:\n# {e}\n```"
    )
+# --- 🎣 Data Fetching & Processing Functions ---
# Upper bound (seconds) for each HTTP call so a stalled server cannot hang the UI.
_REQUEST_TIMEOUT_SECONDS = 30


def _read_first_available(library, candidates):
    """Try each (path, reader_name) on `library` in order; return (frame, path, reader_name) of the first that loads."""
    last_error = None
    for path, reader_name in candidates:
        try:
            return getattr(library, reader_name)(path), path, reader_name
        except Exception as err:  # best effort: repo file layouts differ
            last_error = err
    if last_error is None:
        raise ValueError("no candidate files were supplied")
    raise last_error


def _default_file_candidates(base_path: str):
    """List the file locations probed for a repo, most likely first."""
    return [
        (f"{base_path}data/train-00000-of-00001.parquet", "read_parquet"),
        (f"{base_path}train.parquet", "read_parquet"),
        (f"{base_path}medical_o1_sft.json", "read_json"),
    ]


def _load_with_api(repo_id: str, is_public: bool):
    """Fetch the first 100 rows of the train split from the datasets-server `/rows` endpoint."""
    url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
    headers = {} if is_public else get_auth_headers()
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    data = response.json()
    # Each entry of data['rows'] wraps the record under the 'row' key; that
    # value is a dict, so it cannot be used as a json_normalize record_path.
    df = pd.json_normalize([entry['row'] for entry in data['rows']])
    code_snippet = f"""
# 💻 Generated Code: API (requests)
import requests
import pandas as pd
# For gated datasets, get your token from https://huggingface.co/settings/tokens
# Make sure to `huggingface-cli login` first.
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
url = "{url}"
response = requests.get(url, headers=headers) # Pass headers for gated datasets
data = response.json()
df = pd.json_normalize([item['row'] for item in data['rows']])
print(df.head())
"""
    return df, code_snippet


def _load_with_pandas(repo_id: str, is_public: bool):
    """Read the repo's data file over `hf://` with pandas; the snippet shows the file that actually loaded."""
    base_path = f"hf://datasets/{repo_id}/"
    if repo_id == "fka/awesome-chatgpt-prompts":
        candidates = [(f"{base_path}prompts.csv", "read_csv")]
    else:
        candidates = _default_file_candidates(base_path)
    df, file_path, reader_name = _read_first_available(pd, candidates)
    code_snippet = f"""
# 💻 Generated Code: Pandas
import pandas as pd
# Make sure to `huggingface-cli login` for gated datasets.
file_path = "{file_path}"
df = pd.{reader_name}(file_path)
print(df.head())
"""
    return df, code_snippet


def _load_with_polars(repo_id: str, is_public: bool):
    """Read the repo's data file over `hf://` with Polars and convert the result to pandas."""
    base_path = f"hf://datasets/{repo_id}/"
    frame, file_path, reader_name = _read_first_available(pl, _default_file_candidates(base_path))
    code_snippet = f"""
# 💻 Generated Code: Polars
import polars as pl
# Make sure to `huggingface-cli login` for gated datasets.
file_path = "{file_path}"
df = pl.{reader_name}(file_path)
print(df.head())
"""
    return frame.to_pandas(), code_snippet


def _load_with_datasets(repo_id: str, is_public: bool):
    """Load the first 100 train rows through the `datasets` library."""
    ds = load_dataset(repo_id, split='train[:100]')
    code_snippet = f"""
# 💻 Generated Code: Datasets
from datasets import load_dataset
# Make sure to `huggingface-cli login` for gated datasets.
ds = load_dataset("{repo_id}", split='train')
print(ds)
"""
    return ds.to_pandas(), code_snippet


def _load_with_dask(repo_id: str, is_public: bool):
    """Read the repo's gzipped JSON-lines files with Dask and keep the first 100 rows."""
    df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz").head(100)
    code_snippet = f"""
# 💻 Generated Code: Dask
import dask.dataframe as dd
# Make sure to `huggingface-cli login` for gated datasets.
ddf = dd.read_json("hf://datasets/{repo_id}/**/*.jsonl.gz")
print(ddf.head())
"""
    return df, code_snippet


def _load_with_croissant(repo_id: str, is_public: bool):
    """Download the repo's Croissant JSON-LD and materialise the first 100 records of the "default" record set."""
    import itertools

    headers = {} if is_public else get_auth_headers()
    jsonld_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
    response = requests.get(jsonld_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail here with the HTTP status instead of later while parsing an error payload.
    response.raise_for_status()
    ds = CroissantDataset(jsonld=response.json())
    # records() is consumed lazily; islice stops after 100 rows.
    data_rows = list(itertools.islice(ds.records("default"), 100))
    code_snippet = f"""
# 💻 Generated Code: Croissant
import requests
from mlcroissant import Dataset as CroissantDataset
import pandas as pd
# For gated datasets, get your token from https://huggingface.co/settings/tokens
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
jsonld_url = "{jsonld_url}"
jsonld = requests.get(jsonld_url, headers=headers).json()
ds = CroissantDataset(jsonld=jsonld)
records = ds.records("default") # This is a generator
# To preview data:
preview_rows = [row for _, row in zip(range(100), records)]
df = pd.DataFrame(preview_rows)
print(df.head())
"""
    return pd.DataFrame(data_rows), code_snippet


# (keyword, loader) pairs, checked in order against the selected access-method label.
_LOADERS = (
    ("API", _load_with_api),
    ("Pandas", _load_with_pandas),
    ("Polars", _load_with_polars),
    ("Datasets", _load_with_datasets),
    ("Dask", _load_with_dask),
    ("Croissant", _load_with_croissant),
)


def _filter_rows(df: pd.DataFrame, query: str, search_cols, dataset_key: str) -> pd.DataFrame:
    """Keep the rows whose searchable text contains `query` (case-insensitive, literal match)."""
    needle = query.lower()

    if dataset_key == 'medical' and 'conversations' in df.columns:
        # Only the second turn's 'value' is searched for this dataset.
        def second_turn_matches(turns) -> bool:
            if not (isinstance(turns, list) and len(turns) > 1):
                return False
            second = turns[1]
            value = second.get('value', '') if isinstance(second, dict) else ''
            return needle in str(value).lower()

        return df[df['conversations'].apply(second_turn_matches)]

    columns = [col for col in search_cols if col in df.columns]
    if not columns:
        # None of the configured columns exist for this access method, so
        # fall back to every text-like column rather than matching nothing.
        columns = list(df.columns)

    # Build the mask on df's own index so `|=` lines up row-for-row.
    mask = pd.Series(False, index=df.index)
    for col in columns:
        series = df[col]
        # Object columns are accepted too: a text column with missing values
        # may not be reported as a string dtype by pandas.
        if not (pd.api.types.is_string_dtype(series) or pd.api.types.is_object_dtype(series)):
            continue
        # regex=False: the query is user input, not a pattern ("(" must not raise).
        hits = series.astype(str).str.contains(query, case=False, na=False, regex=False)
        # Missing cells never match, even if their string form ("nan", "None") would.
        mask |= hits & series.notna()
    return df[mask]


def _looks_like_pil_image(obj) -> bool:
    """Duck-typed check for a PIL-style image object (has `save`, `size` and `mode`)."""
    return callable(getattr(obj, "save", None)) and hasattr(obj, "size") and hasattr(obj, "mode")


def _collect_gallery_items(df: pd.DataFrame):
    """Return `(image, caption)` pairs for every row whose 'image' cell is an image object."""
    # NOTE(review): cells may arrive as encoded dicts rather than image
    # objects depending on the access method — confirm; such rows are skipped.
    items = []
    for _, row in df.iterrows():
        candidate = row.get('image')
        if _looks_like_pil_image(candidate):
            items.append((candidate, row.get('text', '')))
    return items


def fetch_data(dataset_key: str, access_method: str, query: str):
    """
    🚀 Fetch a dataset sample, filter it by `query`, and package it for the UI.

    Args:
        dataset_key: Key into DATASET_CONFIG selecting the dataset.
        access_method: Label of the chosen access method; the loader is picked
            by the keyword it contains ("API", "Pandas", "Polars", "Datasets",
            "Dask", "Croissant"). An unrecognised label yields empty results.
        query: Text to search for; empty/None disables filtering.

    Returns:
        ``(df, gallery, markdown, csv_file, xlsx_file, tab_text, code_snippet)``.
        Any exception is caught and converted by handle_error() into a tuple
        of the same shape, so the UI always receives seven values.
    """
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]
        configured = config["search_col"]
        search_cols = [configured] if isinstance(configured, str) else list(configured)

        df, code_snippet = pd.DataFrame(), ""
        for keyword, loader in _LOADERS:
            if keyword in access_method:
                df, code_snippet = loader(repo_id, config["is_public"])
                break

        # --- 🔍 Universal Search Logic ---
        if query and not df.empty:
            df = _filter_rows(df, query, search_cols, dataset_key)

        # --- 🖼️ Special Image Handling ---
        gallery_output = None
        if dataset_key == 'inscene' and not df.empty:
            gallery_output = gr.Gallery(_collect_gallery_items(df), label="🖼️ Image Results", height=400)

        md, csv, xlsx, tab = dataframe_to_outputs(df)
        return df, gallery_output, md, csv, xlsx, tab, code_snippet
    except Exception as e:
        return handle_error(e)
+# --- 🖼️ UI Generation ---
def create_dataset_tab(dataset_key: str):
    """
    🏗️ Build the UI tab for one dataset and wire its fetch button.

    Must be called inside an open `gr.Tabs()` context. Components are created
    in display order: heading, optional gated-dataset note, method picker and
    search box, fetch button, results table, gallery (visible only for the
    'inscene' dataset), export accordion, and the generated code snippet.
    """
    settings = DATASET_CONFIG[dataset_key]
    emoji = settings['emoji']
    methods = settings['methods']

    with gr.Tab(f"{emoji} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {emoji} Query the `{settings['name']}` Dataset")
        if not settings['is_public']:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")

        with gr.Row():
            method_picker = gr.Radio(methods, label="🔑 Access Method", value=methods[0])
            search_box = gr.Textbox(label="🔍 Search Query", placeholder="Enter a keyword to search...")

        go_button = gr.Button("🚀 Go Fetch!")
        results_table = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
        image_gallery = gr.Gallery(visible=(dataset_key == 'inscene'), label="🖼️ Image Results")

        with gr.Accordion("📂 View/Export Full Results", open=False):
            markdown_view = gr.Markdown(label="📝 Markdown View")
            with gr.Row():
                csv_download = gr.File(label="⬇️ Download CSV")
                xlsx_download = gr.File(label="⬇️ Download XLSX")
            # No `language` argument on purpose: it was removed for maximum compatibility.
            copy_box = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")

        snippet_view = gr.Code(label="💻 Python Code Snippet", language="python")

        # The dataset key travels as a State input so every tab shares fetch_data.
        click_inputs = [gr.State(dataset_key), method_picker, search_box]
        # Order matches the 7-tuple returned by fetch_data / handle_error.
        click_outputs = [
            results_table,
            image_gallery,
            markdown_view,
            csv_download,
            xlsx_download,
            copy_box,
            snippet_view,
        ]
        go_button.click(fn=fetch_data, inputs=click_inputs, outputs=click_outputs)
+# --- 🚀 Main App ---
# Intro paragraph shown under the page title.
_INTRO_TEXT = (
    "Select a dataset, choose an access method, type a query, and see the results instantly. "
    "The app demonstrates various ways to access and search Hugging Face datasets and generates the code for you!"
)

# Build the page at import time so `demo` exists as a module-level name.
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
    gr.Markdown(_INTRO_TEXT)
    with gr.Tabs():
        # One tab per configured dataset, in DATASET_CONFIG order.
        for key in DATASET_CONFIG:
            create_dataset_tab(key)

if __name__ == "__main__":
    demo.launch(debug=True)