From 3f1e3826888f63f952a5a73a32cc5e60549bd082 Mon Sep 17 00:00:00 2001
From: deepbuzin
Date: Wed, 21 Aug 2024 10:40:37 +0000
Subject: [PATCH 1/9] Add agent outline

---
 agent.ipynb      | 382 ++++++++++++++++++++++++++++++++++++++++++
 example_gui.py   | 428 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 3 files changed, 811 insertions(+)
 create mode 100644 agent.ipynb
 create mode 100644 example_gui.py

diff --git a/agent.ipynb b/agent.ipynb
new file mode 100644
index 0000000..1765ccb
--- /dev/null
+++ b/agent.ipynb
@@ -0,0 +1,382 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import TypedDict, Annotated, Sequence, List, Optional\n",
+    "import operator\n",
+    "\n",
+    "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n",
+    "from langchain.pydantic_v1 import BaseModel, Field"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_openai import AzureChatOpenAI\n",
+    "\n",
+    "llm = AzureChatOpenAI(\n",
+    "    temperature=0.0,\n",
+    "    azure_deployment=\"gpt4o\",\n",
+    "    openai_api_version=\"2023-07-01-preview\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. Create the state\n",
+    "\n",
+    "class AgentState(TypedDict):\n",
+    "\tsearch_queries: List[str]\n",
+    "\tvideo_ids: List[str]\n",
+    "\tclip_texts = List[str]\n",
+    "\tclues = List[str]\n",
+    "\tannotations = List[str]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 3. Set prompts\n",
+    "\n",
+    "GEN_QUERIES_PROMPT = (\n",
+    "    \"You are helping the user to find a very large and diverse set of videos on a video hosting service.\",\n",
+    "    \"A user will only describe which videos they are looking for and how many queries they need.\",\n",
+    ")\n",
+    "\n",
+    "# prompt='I want to find instructional videos about how to do squats.',\n",
+    "# num_queries_prompt = f'I need {num_queries} queries'\n",
+    "\n",
+    "EXTRACT_CLUES_PROMPT = \"\"\"You are a highly intelligent data investigator. \n",
+    "You take unstructured damaged data and look for clues that could help restore the initial information\n",
+    "and extract important insights from it.\n",
+    "You are the best one for this job in the world because you are a former detective. \n",
+    "You care about even the smallest details, and your guesses about what happened in the initial file\n",
+    "even at very limited inputs are usually absolutely right. \n",
+    "You use deductive and inductive reasoning at the highest possible quality.\n",
+    "\n",
+    "#YOUR TODAY'S JOB\n",
+    "The user needs to learn about what happens in a specific segment of a video file. Your job is to help the user by providing clues that would help the user make the right assumption.\n",
+    "The user will provide you with: \n",
+    "1. Instructions about what kind of information the user is trying to obtain.\n",
+    "2. A list of time codes of the segments in format \"<start_timestamp>-<end_timestamp>\". All the provided segments of the video contain what the user is looking for, but other parts of the video might have different content.\n",
+    "3. A transcript of the *full video* in format \"<timestamp>: <text>\\n\"\n",
+    "\n",
+    "Your task:\n",
+    "1. Read the transcript.\n",
+    "2. Provide the clues in a given format.\n",
+    "3. Provide any other info requested by the user.\n",
+    "\n",
+    "#RULES\n",
+    "!!! VERY IMPORTANT !!!\n",
+    "1. Rely only on the data provided in the transcript. Do not improvise. All quotes and their corresponding timestamps must be taken directly from the transcript.\n",
+    "2. Your job is to find the data already provided in the transcript.\n",
+    "3. Analyze every segment. Only skip a segment if there is no information about it in the transcript.\n",
+    "4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.\n",
+    "5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.\n",
+    "6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.\n",
+    "7. Follow the output format.\n",
+    "8. Be very careful with details. Don't generalize. Always double check your results.\n",
+    "\n",
+    "Please help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.\n",
+    "\n",
+    "WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data, \n",
+    "is a fragment of information extracted from a corrupted or incomplete source that provides \n",
+    "insight into the original content. These fragments serve as starting points for inference \n",
+    "and deduction, allowing researchers to hypothesize about the fuller context or meaning of \n",
+    "the degraded material. The process of identifying and interpreting clues involves both objective analysis of the \n",
+    "available data and subjective extrapolation based on domain knowledge, contextual understanding, \n",
+    "and logical reasoning.\n",
+    "\n",
+    "Here is what the user expects from you:\n",
+    "1. *Local clues* that would help the user understand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside that specific segment.\n",
+    "2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific segment they are provided for.\n",
+    "3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. Logical inferences for a segment are deduced from the local and global clues for this segment.\n",
+    "\n",
+    "!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!\n",
+    "\"\"\"\n",
+    "\n",
+    "# TODO: this also needs a structured output prompt\n",
+    "\n",
+    "# EXTRACT_CLUES_PROMPT = \"\"\"\n",
+    "# \"User's instructions: The provided video is a tutorial about how to perform squats.\n",
+    "\n",
+    "# I need to understand HOW THE PERSON SHOWN IN EACH SEGMENT PERFORMS SQUATS IN THIS SEGMENT.\n",
+    "# What is done correctly.\n",
+    "# What mistakes they make. Why these mistakes happen.\n",
+    "# How these mistakes could be improved.\n",
+    "\n",
+    "# It is very important that the information you provide describes how the person shown in the segment is doing squats, and not some generic advice that is unrelated to the visual information.\n",
+    "# \"\"\"\n",
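# --- Editor's sketch (not part of the original patch): the prompt above demands
# local clues, global clues, and logical inferences per segment, and the TODO notes
# that a structured-output prompt is still missing. One way to enforce that shape is
# a Pydantic schema handed to `llm.with_structured_output(...)`, as the notebook
# already does for queries; the class and field names here are assumptions, not the
# datagen package's actual schema.
from typing import List
from langchain.pydantic_v1 import BaseModel, Field

class SegmentClues(BaseModel):
    """Clues recovered for a single video segment."""
    segment: str = Field(description='segment timecodes, "<start_timestamp>-<end_timestamp>"')
    local_clues: List[str] = Field(description="quotes taken from inside the segment")
    global_clues: List[str] = Field(description="relevant quotes from anywhere in the video")
    logical_inferences: List[str] = Field(description="deductions drawn from the clues above")

class ClueReport(BaseModel):
    """Structured answer covering every segment in one LLM call."""
    segments: List[SegmentClues]

# clue_model = llm.with_structured_output(ClueReport)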
+    "\n",
+    "# prompt.append('Segment timecodes and optional additional information:\\n' + '\\n'.join([s.to_str(skip=[filter_by] if filter_by else []) for s in video_segments_part]))\n",
+    "# prompt.append('Transcript:\\n' + transcript)\n",
+    "\n",
+    "\n",
+    "GEN_ANNOTATIONS_PROMPT = \"\"\"You are a helpful assistant that performs high quality data investigation and transformation.\n",
+    "    You will be given a JSON object with clues and other helpful information about what's going on \n",
+    "    in a specific part of a video file. This part is called a segment. Your job is to:\n",
+    "    1. Read this JSON object carefully\n",
+    "    2. Answer the user's questions about this segment\n",
+    "    3. Provide the answer as a JSON object in a schema provided by the user\n",
+    "    Important rules:\n",
+    "    1. You can only rely on data presented in the provided JSON object. Don't improvise.\n",
+    "    2. Follow the user's request carefully.\n",
+    "    3. Don't rush to deliver the answer. Take some time to think. Take a deep breath. Then start writing.\n",
+    "    4. If you want to output a field as empty (null), output it as JSON null (without quotes), not as the string \"null\". \n",
+    "\"\"\"\n",
+    "\n",
+    "\n",
+    "# human_prompt = \"\"\"\n",
+    "# You are given a JSON object that contains clues about segments of a video with timecodes.\n",
+    "# !!!! For each segment provided in the JSON object you need to answer the following questions:\n",
+    "# 1. Given the data found in the JSON object, what is the probability that this part contains footage of a person doing squats? [the answer can only be \"high\", \"medium\", \"low\", or null (if impossible to infer from the provided data)]\n",
+    "# 2. Given the data found in the JSON object, and even if the answer to the previous question is \"low\", does this person do squats right, wrong, or mixed? [the answer can only be \"right\", \"wrong\", \"mixed\", or null (if impossible to infer from the provided data)]\n",
+    "# 3. Given the data found in the JSON object, what exactly does this person do right and/or wrong regarding their squat technique? [the answer should be clear and focused on body parts]\n",
+    "# 4.
If the answer on the previous question contains description of wrong technique, explain how to fix these mistakes using your \"own knowledge\" like you are a sports coach.\n", + "# \"\"\"\n", + "\n", + "# for clue in clues_part:\n", + "# prompt.append(\"Segment:\\n\" + json.dumps(clue))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datagen import DatagenConfig, get_video_ids, download_videos, detect_segments_clip, generate_clues, generate_annotations\n", + "\n", + "config_params = {\n", + " \"openai\": {\n", + " \"type\": \"azure\", # openai/azure\n", + " \"temperature\": \"1\",\n", + " \"deployment\": \"gpt4o\", # model for openai / deployment for azure\n", + " },\n", + " \"data_dir\": \"./tmp/squats\",\n", + "}\n", + "\n", + "!mkdir -p {config_params[\"data_dir\"]}\n", + "\n", + "# this config handles all the bookeeping so you need to pass it everywhere.\n", + "config = DatagenConfig(**config_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 4. Create nodes\n", + "\n", + "\n", + "def gen_queries_node(state: AgentState):\n", + " class QueryList(BaseModel):\n", + " \"\"\"A list of queries to find videos on a video hosting service\"\"\"\n", + "\n", + " queries: list[str] = Field(default=None, description=\"a list of queries\")\n", + "\n", + " messages = [\n", + " SystemMessage(content=GEN_QUERIES_PROMPT),\n", + " HumanMessage(content=state[\"task\"]),\n", + " ]\n", + "\n", + " model = llm.with_structured_output(QueryList)\n", + " response = model.invoke(messages)\n", + "\n", + " return {\"search_queries\": response.content}\n", + "\n", + "\n", + "def get_video_ids_node(state: AgentState):\n", + " ids = get_video_ids(\n", + " state[\"search_queries\"],\n", + " config=config,\n", + " videos_per_query=2,\n", + " only_creative_commons=False,\n", + " )\n", + " return {\"video_ids\": ids}\n", + "\n", + "\n", + "def download_node(state: AgentState):\n", + " videos = download_videos(ids, config)\n", + " # save to state\n", + " return {\"something\": videos}\n", + "\n", + "\n", + "def detect_segments_node(state: AgentState):\n", + " segments = detect_segments_clip(\n", + " # video_ids=['KvRK5Owqzgw'],\n", + " text_prompts=\"a person doing squats\", # that's the text for CLIP to compare to images. You can provide a list of texts to use average distance.\n", + " model=model,\n", + " processor=processor,\n", + " fps_sampling=2, # the more fps, the more granular segment borders and more precise segments, at the cost of speed.\n", + " device=\"cuda\", # 'cpu' for local\n", + " frames_per_batch=100, # 100 frames use about 10GB GPU RAM, so batch to fill your GPU RAM.\n", + " config=config,\n", + " )\n", + " return {\"segments\": segments}\n", + "\n", + "\n", + "def extract_clues_node(state: AgentState):\n", + " clues = []\n", + "\n", + " clues = generate_clues(\n", + " # video_ids=['byxWus7BwfQ'],\n", + " config=config,\n", + " human_prompt=human_prompt,\n", + " segments_per_call=5, # the output might be quite long, so need to limit number of segments per gpt call to respect max output legnth\n", + " raise_on_error=True, # interrupt when encountering an error. 
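# --- Editor's sketch (an assumption about datagen's internals, not its actual code):
# `segments_per_call=5` above exists because model output length is capped, so a
# helper like `generate_clues` presumably chunks the segment list and makes one
# structured-output call per chunk. The chunking step itself is just:
from typing import Iterator, List, TypeVar

T = TypeVar("T")

def batched(items: List[T], batch_size: int) -> Iterator[List[T]]:
    """Yield successive `batch_size`-sized chunks of `items`; the last may be shorter."""
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]

# e.g. one LLM call per batch keeps each response within the max output length:
# for segment_batch in batched(segments, batch_size=5):
#     ...invoke the clue-extraction prompt on just these five segments...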
Useful for debugging.\n", + " )\n", + "\n", + " return {\"clues\": clues}\n", + "\n", + "\n", + "def gen_annotations_node(state: AgentState):\n", + "\n", + " class SegmentFeedback(BaseModel):\n", + " \"\"\"\n", + " —> GOOD EXAMPLES:\n", + " \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", + " \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", + " \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", + " \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", + " \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", + " \"correction\":\" Keep your weight on your heels and press through them as you rise.\"\n", + " \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", + " \"correction\":null\n", + " —> BAD EXAMPLES:\n", + " \"wrong\":\"knees\"\n", + " \"correction\":\"fix knees\"\n", + " \"wrong\":\"back looks funny\"\n", + " \"correction\":\"make back better\"\n", + " \"wrong\":\"feet are doing something\"\n", + " \"correction\":\"feet should be different\"\n", + " \"right\":\"arms\"\n", + " \"correction\":\"arms are fine i think\"\n", + " —> BAD EXAMPLES END HERE\n", + " \"\"\"\n", + "\n", + " right: Optional[str] = Field(description=\"what was right in the performance\")\n", + " wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n", + " correction: Optional[str] = Field(\n", + " description=\"how and in what ways it the performance could be improved\"\n", + " )\n", + "\n", + " # The segment timestamps are taken from the provided information.\n", + " class SegmentAnnotation(BaseModel):\n", + " squats_probability: Optional[str] = Field(\n", + " description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n", + " )\n", + " squats_technique_correctness: Optional[str] = Field(\n", + " description=\"correctness of the squat technique.\"\n", + " )\n", + " squats_feedback: Optional[SegmentFeedback] = Field(\n", + " description=\"what was right and wrong in the squat perfomance in the segment. 
When the technique is incorrect, provide instructions how to correct them.\"\n", + " )\n", + "\n", + " annotations = generate_annotations(\n", + " human_prompt=human_prompt,\n", + " config=config,\n", + " segments_per_call=5,\n", + " annotation_schema=SegmentAnnotation,\n", + " )\n", + "\n", + " return {\"annotations\": annotations}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langgraph.graph import StateGraph, END\n", + "from typing import TypedDict, Annotated, List\n", + "import operator\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage\n", + "\n", + "memory = MemorySaver()\n", + "# memory = SqliteSaver.from_conn_string(\":memory:\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = StateGraph(AgentState)\n", + "\n", + "builder.add_node(\"generate_queries\", gen_queries_node)\n", + "builder.add_node(\"get_video_ids\", get_video_ids_node)\n", + "builder.add_node(\"download\", download_node)\n", + "builder.add_node(\"detect_segments\", detect_segments_node)\n", + "builder.add_node(\"extract_clues\", extract_clues_node)\n", + "builder.add_node(\"gen_annotations\", gen_annotations_node)\n", + "\n", + "builder.set_entry_point(\"generate_queries\")\n", + "\n", + "# builder.add_conditional_edges(\n", + "# \"generate\", \n", + "# should_continue, \n", + "# {END: END, \"reflect\": \"reflect\"}\n", + "# )\n", + "\n", + "builder.add_edge(\"generate_queries\", \"get_video_ids\")\n", + "builder.add_edge(\"get_video_ids\", \"download\")\n", + "builder.add_edge(\"download\", \"detect_segments\")\n", + "builder.add_edge(\"detect_segments\", \"extract_clues\")\n", + "builder.add_edge(\"extract_clues\", \"gen_annotations\")\n", + "builder.add_edge(\"gen_annotations\", END)\n", + "\n", + "graph = builder.compile(checkpointer=memory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "thread = {\"configurable\": {\"thread_id\": \"1\"}}\n", + "for s in graph.stream(\n", + " {\n", + " \"task\": \"what is the difference between langchain and langsmith\",\n", + " },\n", + " thread,\n", + "):\n", + " print(s)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "datagen", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_gui.py b/example_gui.py new file mode 100644 index 0000000..8b32c72 --- /dev/null +++ b/example_gui.py @@ -0,0 +1,428 @@ +import warnings +warnings.filterwarnings("ignore", message=".*TqdmWarning.*") +from dotenv import load_dotenv + +_ = load_dotenv() + +from langgraph.graph import StateGraph, END +from typing import TypedDict, Annotated, List +import operator +from langgraph.checkpoint.sqlite import SqliteSaver +from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage +from langchain_openai import ChatOpenAI +from langchain_core.pydantic_v1 import BaseModel +from tavily import TavilyClient +import os +import sqlite3 + +class AgentState(TypedDict): + task: str + lnode: str + plan: str + draft: str + critique: str + content: List[str] + queries: List[str] + revision_number: int + max_revisions: int + count: Annotated[int, operator.add] + + +class Queries(BaseModel): + 
queries: List[str] + +class ewriter(): + def __init__(self): + self.model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0) + self.PLAN_PROMPT = ("You are an expert writer tasked with writing a high level outline of a short 3 paragraph essay. " + "Write such an outline for the user provided topic. Give the three main headers of an outline of " + "the essay along with any relevant notes or instructions for the sections. ") + self.WRITER_PROMPT = ("You are an essay assistant tasked with writing excellent 3 paragraph essays. " + "Generate the best essay possible for the user's request and the initial outline. " + "If the user provides critique, respond with a revised version of your previous attempts. " + "Utilize all the information below as needed: \n" + "------\n" + "{content}") + self.RESEARCH_PLAN_PROMPT = ("You are a researcher charged with providing information that can " + "be used when writing the following essay. Generate a list of search " + "queries that will gather " + "any relevant information. Only generate 3 queries max.") + self.REFLECTION_PROMPT = ("You are a teacher grading an 3 paragraph essay submission. " + "Generate critique and recommendations for the user's submission. " + "Provide detailed recommendations, including requests for length, depth, style, etc.") + self.RESEARCH_CRITIQUE_PROMPT = ("You are a researcher charged with providing information that can " + "be used when making any requested revisions (as outlined below). " + "Generate a list of search queries that will gather any relevant information. " + "Only generate 2 queries max.") + self.tavily = TavilyClient(api_key=os.environ["TAVILY_API_KEY"]) + builder = StateGraph(AgentState) + builder.add_node("planner", self.plan_node) + builder.add_node("research_plan", self.research_plan_node) + builder.add_node("generate", self.generation_node) + builder.add_node("reflect", self.reflection_node) + builder.add_node("research_critique", self.research_critique_node) + builder.set_entry_point("planner") + builder.add_conditional_edges( + "generate", + self.should_continue, + {END: END, "reflect": "reflect"} + ) + builder.add_edge("planner", "research_plan") + builder.add_edge("research_plan", "generate") + builder.add_edge("reflect", "research_critique") + builder.add_edge("research_critique", "generate") + memory = SqliteSaver(conn=sqlite3.connect(":memory:", check_same_thread=False)) + self.graph = builder.compile( + checkpointer=memory, + interrupt_after=['planner', 'generate', 'reflect', 'research_plan', 'research_critique'] + ) + + + def plan_node(self, state: AgentState): + messages = [ + SystemMessage(content=self.PLAN_PROMPT), + HumanMessage(content=state['task']) + ] + response = self.model.invoke(messages) + return {"plan": response.content, + "lnode": "planner", + "count": 1, + } + def research_plan_node(self, state: AgentState): + queries = self.model.with_structured_output(Queries).invoke([ + SystemMessage(content=self.RESEARCH_PLAN_PROMPT), + HumanMessage(content=state['task']) + ]) + content = state['content'] or [] # add to content + for q in queries.queries: + response = self.tavily.search(query=q, max_results=2) + for r in response['results']: + content.append(r['content']) + return {"content": content, + "queries": queries.queries, + "lnode": "research_plan", + "count": 1, + } + def generation_node(self, state: AgentState): + content = "\n\n".join(state['content'] or []) + user_message = HumanMessage( + content=f"{state['task']}\n\nHere is my plan:\n\n{state['plan']}") + messages = [ + 
SystemMessage( + content=self.WRITER_PROMPT.format(content=content) + ), + user_message + ] + response = self.model.invoke(messages) + return { + "draft": response.content, + "revision_number": state.get("revision_number", 1) + 1, + "lnode": "generate", + "count": 1, + } + def reflection_node(self, state: AgentState): + messages = [ + SystemMessage(content=self.REFLECTION_PROMPT), + HumanMessage(content=state['draft']) + ] + response = self.model.invoke(messages) + return {"critique": response.content, + "lnode": "reflect", + "count": 1, + } + def research_critique_node(self, state: AgentState): + queries = self.model.with_structured_output(Queries).invoke([ + SystemMessage(content=self.RESEARCH_CRITIQUE_PROMPT), + HumanMessage(content=state['critique']) + ]) + content = state['content'] or [] + for q in queries.queries: + response = self.tavily.search(query=q, max_results=2) + for r in response['results']: + content.append(r['content']) + return {"content": content, + "lnode": "research_critique", + "count": 1, + } + def should_continue(self, state): + if state["revision_number"] > state["max_revisions"]: + return END + return "reflect" + +import gradio as gr +import time + +class writer_gui( ): + def __init__(self, graph, share=False): + self.graph = graph + self.share = share + self.partial_message = "" + self.response = {} + self.max_iterations = 10 + self.iterations = [] + self.threads = [] + self.thread_id = -1 + self.thread = {"configurable": {"thread_id": str(self.thread_id)}} + #self.sdisps = {} #global + self.demo = self.create_interface() + + def run_agent(self, start,topic,stop_after): + #global partial_message, thread_id,thread + #global response, max_iterations, iterations, threads + if start: + self.iterations.append(0) + config = {'task': topic,"max_revisions": 2,"revision_number": 0, + 'lnode': "", 'planner': "no plan", 'draft': "no draft", 'critique': "no critique", + 'content': ["no content",], 'queries': "no queries", 'count':0} + self.thread_id += 1 # new agent, new thread + self.threads.append(self.thread_id) + else: + config = None + self.thread = {"configurable": {"thread_id": str(self.thread_id)}} + while self.iterations[self.thread_id] < self.max_iterations: + self.response = self.graph.invoke(config, self.thread) + self.iterations[self.thread_id] += 1 + self.partial_message += str(self.response) + self.partial_message += f"\n------------------\n\n" + ## fix + lnode,nnode,_,rev,acount = self.get_disp_state() + yield self.partial_message,lnode,nnode,self.thread_id,rev,acount + config = None #need + #print(f"run_agent:{lnode}") + if not nnode: + #print("Hit the end") + return + if lnode in stop_after: + #print(f"stopping due to stop_after {lnode}") + return + else: + #print(f"Not stopping on lnode {lnode}") + pass + return + + def get_disp_state(self,): + current_state = self.graph.get_state(self.thread) + lnode = current_state.values["lnode"] + acount = current_state.values["count"] + rev = current_state.values["revision_number"] + nnode = current_state.next + #print (lnode,nnode,self.thread_id,rev,acount) + return lnode,nnode,self.thread_id,rev,acount + + def get_state(self,key): + current_values = self.graph.get_state(self.thread) + if key in current_values.values: + lnode,nnode,self.thread_id,rev,astep = self.get_disp_state() + new_label = f"last_node: {lnode}, thread_id: {self.thread_id}, rev: {rev}, step: {astep}" + return gr.update(label=new_label, value=current_values.values[key]) + else: + return "" + + def get_content(self,): + current_values = 
self.graph.get_state(self.thread) + if "content" in current_values.values: + content = current_values.values["content"] + lnode,nnode,thread_id,rev,astep = self.get_disp_state() + new_label = f"last_node: {lnode}, thread_id: {self.thread_id}, rev: {rev}, step: {astep}" + return gr.update(label=new_label, value="\n\n".join(item for item in content) + "\n\n") + else: + return "" + + def update_hist_pd(self,): + #print("update_hist_pd") + hist = [] + # curiously, this generator returns the latest first + for state in self.graph.get_state_history(self.thread): + if state.metadata['step'] < 1: + continue + thread_ts = state.config['configurable']['thread_ts'] + tid = state.config['configurable']['thread_id'] + count = state.values['count'] + lnode = state.values['lnode'] + rev = state.values['revision_number'] + nnode = state.next + st = f"{tid}:{count}:{lnode}:{nnode}:{rev}:{thread_ts}" + hist.append(st) + return gr.Dropdown(label="update_state from: thread:count:last_node:next_node:rev:thread_ts", + choices=hist, value=hist[0],interactive=True) + + def find_config(self,thread_ts): + for state in self.graph.get_state_history(self.thread): + config = state.config + if config['configurable']['thread_ts'] == thread_ts: + return config + return(None) + + def copy_state(self,hist_str): + ''' result of selecting an old state from the step pulldown. Note does not change thread. + This copies an old state to a new current state. + ''' + thread_ts = hist_str.split(":")[-1] + #print(f"copy_state from {thread_ts}") + config = self.find_config(thread_ts) + #print(config) + state = self.graph.get_state(config) + self.graph.update_state(self.thread, state.values, as_node=state.values['lnode']) + new_state = self.graph.get_state(self.thread) #should now match + new_thread_ts = new_state.config['configurable']['thread_ts'] + tid = new_state.config['configurable']['thread_id'] + count = new_state.values['count'] + lnode = new_state.values['lnode'] + rev = new_state.values['revision_number'] + nnode = new_state.next + return lnode,nnode,new_thread_ts,rev,count + + def update_thread_pd(self,): + #print("update_thread_pd") + return gr.Dropdown(label="choose thread", choices=threads, value=self.thread_id,interactive=True) + + def switch_thread(self,new_thread_id): + #print(f"switch_thread{new_thread_id}") + self.thread = {"configurable": {"thread_id": str(new_thread_id)}} + self.thread_id = new_thread_id + return + + def modify_state(self,key,asnode,new_state): + ''' gets the current state, modifes a single value in the state identified by key, and updates state with it. + note that this will create a new 'current state' node. If you do this multiple times with different keys, it will create + one for each update. 
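# --- Editor's sketch (not from the patch): the LangGraph pattern that copy_state and
# modify_state rely on. `graph.update_state(config, values, as_node=...)` appends a
# new checkpoint as if the named node had just produced `values`, and a subsequent
# `graph.invoke(None, config)` resumes from that forked state. The function name and
# arguments below are illustrative; only the graph methods are real LangGraph API.
def fork_and_resume(graph, thread, key, new_value):
    """Fork the latest checkpoint with one edited value, then resume the run."""
    snapshot = graph.get_state(thread)          # current checkpoint values
    values = dict(snapshot.values)
    values[key] = new_value                     # edit a single state field
    # record the edit as if the last-run node had produced it
    graph.update_state(thread, values, as_node=values["lnode"])
    return graph.invoke(None, thread)           # continue from the fork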
Note also that it doesn't resume after the update + ''' + current_values = self.graph.get_state(self.thread) + current_values.values[key] = new_state + self.graph.update_state(self.thread, current_values.values,as_node=asnode) + return + + + def create_interface(self): + with gr.Blocks(theme=gr.themes.Default(spacing_size='sm',text_size="sm")) as demo: + + def updt_disp(): + ''' general update display on state change ''' + current_state = self.graph.get_state(self.thread) + hist = [] + # curiously, this generator returns the latest first + for state in self.graph.get_state_history(self.thread): + if state.metadata['step'] < 1: #ignore early states + continue + s_thread_ts = state.config['configurable']['thread_ts'] + s_tid = state.config['configurable']['thread_id'] + s_count = state.values['count'] + s_lnode = state.values['lnode'] + s_rev = state.values['revision_number'] + s_nnode = state.next + st = f"{s_tid}:{s_count}:{s_lnode}:{s_nnode}:{s_rev}:{s_thread_ts}" + hist.append(st) + if not current_state.metadata: #handle init call + return{} + else: + return { + topic_bx : current_state.values["task"], + lnode_bx : current_state.values["lnode"], + count_bx : current_state.values["count"], + revision_bx : current_state.values["revision_number"], + nnode_bx : current_state.next, + threadid_bx : self.thread_id, + thread_pd : gr.Dropdown(label="choose thread", choices=self.threads, value=self.thread_id,interactive=True), + step_pd : gr.Dropdown(label="update_state from: thread:count:last_node:next_node:rev:thread_ts", + choices=hist, value=hist[0],interactive=True), + } + def get_snapshots(): + new_label = f"thread_id: {self.thread_id}, Summary of snapshots" + sstate = "" + for state in self.graph.get_state_history(self.thread): + for key in ['plan', 'draft', 'critique']: + if key in state.values: + state.values[key] = state.values[key][:80] + "..." + if 'content' in state.values: + for i in range(len(state.values['content'])): + state.values['content'][i] = state.values['content'][i][:20] + '...' 
+ if 'writes' in state.metadata: + state.metadata['writes'] = "not shown" + sstate += str(state) + "\n\n" + return gr.update(label=new_label, value=sstate) + + def vary_btn(stat): + #print(f"vary_btn{stat}") + return(gr.update(variant=stat)) + + with gr.Tab("Agent"): + with gr.Row(): + topic_bx = gr.Textbox(label="Essay Topic", value="Pizza Shop") + gen_btn = gr.Button("Generate Essay", scale=0,min_width=80, variant='primary') + cont_btn = gr.Button("Continue Essay", scale=0,min_width=80) + with gr.Row(): + lnode_bx = gr.Textbox(label="last node", min_width=100) + nnode_bx = gr.Textbox(label="next node", min_width=100) + threadid_bx = gr.Textbox(label="Thread", scale=0, min_width=80) + revision_bx = gr.Textbox(label="Draft Rev", scale=0, min_width=80) + count_bx = gr.Textbox(label="count", scale=0, min_width=80) + with gr.Accordion("Manage Agent", open=False): + checks = list(self.graph.nodes.keys()) + checks.remove('__start__') + stop_after = gr.CheckboxGroup(checks,label="Interrupt After State", value=checks, scale=0, min_width=400) + with gr.Row(): + thread_pd = gr.Dropdown(choices=self.threads,interactive=True, label="select thread", min_width=120, scale=0) + step_pd = gr.Dropdown(choices=['N/A'],interactive=True, label="select step", min_width=160, scale=1) + live = gr.Textbox(label="Live Agent Output", lines=5, max_lines=5) + + # actions + sdisps =[topic_bx,lnode_bx,nnode_bx,threadid_bx,revision_bx,count_bx,step_pd,thread_pd] + thread_pd.input(self.switch_thread, [thread_pd], None).then( + fn=updt_disp, inputs=None, outputs=sdisps) + step_pd.input(self.copy_state,[step_pd],None).then( + fn=updt_disp, inputs=None, outputs=sdisps) + gen_btn.click(vary_btn,gr.Number("secondary", visible=False), gen_btn).then( + fn=self.run_agent, inputs=[gr.Number(True, visible=False),topic_bx,stop_after], outputs=[live],show_progress=True).then( + fn=updt_disp, inputs=None, outputs=sdisps).then( + vary_btn,gr.Number("primary", visible=False), gen_btn).then( + vary_btn,gr.Number("primary", visible=False), cont_btn) + cont_btn.click(vary_btn,gr.Number("secondary", visible=False), cont_btn).then( + fn=self.run_agent, inputs=[gr.Number(False, visible=False),topic_bx,stop_after], + outputs=[live]).then( + fn=updt_disp, inputs=None, outputs=sdisps).then( + vary_btn,gr.Number("primary", visible=False), cont_btn) + + with gr.Tab("Plan"): + with gr.Row(): + refresh_btn = gr.Button("Refresh") + modify_btn = gr.Button("Modify") + plan = gr.Textbox(label="Plan", lines=10, interactive=True) + refresh_btn.click(fn=self.get_state, inputs=gr.Number("plan", visible=False), outputs=plan) + modify_btn.click(fn=self.modify_state, inputs=[gr.Number("plan", visible=False), + gr.Number("planner", visible=False), plan],outputs=None).then( + fn=updt_disp, inputs=None, outputs=sdisps) + with gr.Tab("Research Content"): + refresh_btn = gr.Button("Refresh") + content_bx = gr.Textbox(label="content", lines=10) + refresh_btn.click(fn=self.get_content, inputs=None, outputs=content_bx) + with gr.Tab("Draft"): + with gr.Row(): + refresh_btn = gr.Button("Refresh") + modify_btn = gr.Button("Modify") + draft_bx = gr.Textbox(label="draft", lines=10, interactive=True) + refresh_btn.click(fn=self.get_state, inputs=gr.Number("draft", visible=False), outputs=draft_bx) + modify_btn.click(fn=self.modify_state, inputs=[gr.Number("draft", visible=False), + gr.Number("generate", visible=False), draft_bx], outputs=None).then( + fn=updt_disp, inputs=None, outputs=sdisps) + with gr.Tab("Critique"): + with gr.Row(): + refresh_btn = 
gr.Button("Refresh") + modify_btn = gr.Button("Modify") + critique_bx = gr.Textbox(label="Critique", lines=10, interactive=True) + refresh_btn.click(fn=self.get_state, inputs=gr.Number("critique", visible=False), outputs=critique_bx) + modify_btn.click(fn=self.modify_state, inputs=[gr.Number("critique", visible=False), + gr.Number("reflect", visible=False), + critique_bx], outputs=None).then( + fn=updt_disp, inputs=None, outputs=sdisps) + with gr.Tab("StateSnapShots"): + with gr.Row(): + refresh_btn = gr.Button("Refresh") + snapshots = gr.Textbox(label="State Snapshots Summaries") + refresh_btn.click(fn=get_snapshots, inputs=None, outputs=snapshots) + return demo + + def launch(self, share=None): + if port := os.getenv("PORT1"): + self.demo.launch(share=True, server_port=int(port), server_name="0.0.0.0") + else: + self.demo.launch(share=self.share) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2e3aed1..3e590de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ yt-dlp # openai langchain langchain-openai +langgraph pandas opencv-python scenedetect From 55257e82396deae84fbeebba012d488704f0f175 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Thu, 22 Aug 2024 08:03:56 +0000 Subject: [PATCH 2/9] Refactor nodes through segment extraction --- agent.ipynb | 459 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 369 insertions(+), 90 deletions(-) diff --git a/agent.ipynb b/agent.ipynb index 1765ccb..651be48 100644 --- a/agent.ipynb +++ b/agent.ipynb @@ -28,6 +28,60 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class VideoInfo(BaseModel):\n", + " video_id: str\n", + " url: str\n", + " relative_video_path: str\n", + " subs: str\n", + " transcript: str\n", + "\n", + "\n", + "class SegmentInfo(BaseModel): # , Generic[OutputSchema]):\n", + " start_timestamp: str\n", + " end_timestamp: str\n", + " fps: float\n", + " # segment_info: Optional[OutputSchema]\n", + " video_id: str\n", + " # _frames: Optional[\n", + " # list[np.array]\n", + " # ] # List of raw frames that got into LLM. 
Added for debugging purposes.\n", + "\n", + " # @classmethod\n", + " # def from_frames(cls, start_frame, end_frame, fps, **kwargs):\n", + " # return cls(\n", + " # start_timestamp=seconds_to_ts(start_frame / fps),\n", + " # end_timestamp=seconds_to_ts(end_frame / fps),\n", + " # fps=fps,\n", + " # **kwargs,\n", + " # )\n", + "\n", + " @classmethod\n", + " def from_seconds(cls, start_seconds, end_seconds, **kwargs):\n", + " return cls(\n", + " start_timestamp=seconds_to_ts(start_seconds),\n", + " end_timestamp=seconds_to_ts(end_seconds),\n", + " **kwargs,\n", + " )\n", + "\n", + " # def to_str(self, skip: list[str] = []):\n", + " # # skip -> fields from segment_info\n", + " # # dict() works both with pydantic model and with with unparsed dict\n", + " # if self.segment_info:\n", + " # d = dict(self.segment_info)\n", + " # for s in skip:\n", + " # del d[s]\n", + " # d = \": \" + json.dumps(d)\n", + " # else:\n", + " # d = \"\"\n", + " # return f\"{self.start_timestamp}-{self.end_timestamp}{d}\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -39,7 +93,9 @@ "class AgentState(TypedDict):\n", "\tsearch_queries: List[str]\n", "\tvideo_ids: List[str]\n", - "\tclip_texts = List[str]\n", + "\tvideo_infos: List[VideoInfo]\n", + "\tclip_text_prompts = List[str]\n", + "\tsegment_infos: List[SegmentInfo]\n", "\tclues = List[str]\n", "\tannotations = List[str]" ] @@ -176,6 +232,71 @@ "config = DatagenConfig(**config_params)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import scrapetube\n", + "import yt_dlp\n", + "from datetime import datetime\n", + "from pathlib import Path\n", + "from collections import defaultdict\n", + "from datagen.core.sub_utils import vtt_to_txt\n", + "from datagen.detect_segments import get_segments\n", + "import torch\n", + "from transformers import AutoModel, AutoProcessor\n", + "import pandas as pd\n", + "from tsmoothie.smoother import LowessSmoother" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import decord\n", + "import math\n", + "import numpy as np\n", + "\n", + "decord.bridge.set_bridge(\"torch\")\n", + "\n", + "\n", + "class VideoInferenceDataset(torch.utils.data.IterableDataset):\n", + " def __init__(self, video_infos: List[VideoInfo]):\n", + " super(VideoInferenceDataset).__init__()\n", + "\n", + " self.video_infos = video_infos\n", + " self.frame_generator = self.get_frame_generator(video_infos)\n", + "\n", + " @staticmethod\n", + " def get_frame_generator(video_infos):\n", + "\n", + " for video_info in video_infos:\n", + " video_path = Path(video_info.relative_video_path)\n", + " vr = decord.VideoReader(str(video_path))\n", + " num_frames = len(vr)\n", + " fps = math.ceil(num_frames / video_info.duration)\n", + " frame_indices = range(0, num_frames, fps)\n", + "\n", + " for frame_idx in frame_indices:\n", + " frame = vr[frame_idx].asnumpy()\n", + " yield {\n", + " \"frame\": frame,\n", + " \"frame_idx\": frame_idx,\n", + " \"video_id\": video_info.video_id,\n", + " }\n", + "\n", + " def __next__(self):\n", + " return next(self.frame_generator)\n", + "\n", + " def __iter__(self):\n", + " # worker_info = torch.utils.data.get_worker_info()\n", + " return self" + ] + }, { "cell_type": "code", "execution_count": null, @@ -189,7 +310,7 @@ " class QueryList(BaseModel):\n", " \"\"\"A list of queries to find videos on a video hosting service\"\"\"\n", "\n", - " queries: list[str] = Field(default=None, description=\"a list of 
queries\")\n", + " search_queries: list[str] = Field(default=None, description=\"a list of queries\")\n", "\n", " messages = [\n", " SystemMessage(content=GEN_QUERIES_PROMPT),\n", @@ -197,106 +318,262 @@ " ]\n", "\n", " model = llm.with_structured_output(QueryList)\n", - " response = model.invoke(messages)\n", + " response: QueryList = model.invoke(messages)\n", "\n", - " return {\"search_queries\": response.content}\n", + " return {\"search_queries\": response.search_queries}\n", "\n", "\n", "def get_video_ids_node(state: AgentState):\n", - " ids = get_video_ids(\n", - " state[\"search_queries\"],\n", - " config=config,\n", - " videos_per_query=2,\n", - " only_creative_commons=False,\n", - " )\n", - " return {\"video_ids\": ids}\n", + "\n", + " queries = state[\"search_queries\"]\n", + " videos_per_query = 2\n", + " sleep = 0\n", + " sort_by = \"relevance\"\n", + " results_type = \"video\"\n", + " only_creative_commons = False\n", + "\n", + " video_ids = set()\n", + " for query in queries:\n", + " for video in scrapetube.get_search(\n", + " query=query,\n", + " limit=videos_per_query,\n", + " sleep=sleep,\n", + " sort_by=sort_by,\n", + " results_type=results_type,\n", + " ):\n", + " video_ids.add(video[\"videoId\"])\n", + " video_ids = list(video_ids)\n", + "\n", + " if only_creative_commons:\n", + " video_ids_cc = []\n", + " for i in video_ids:\n", + " YDL_OPTIONS = {\n", + " \"quiet\": True,\n", + " \"simulate\": True,\n", + " \"forceurl\": True,\n", + " }\n", + " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", + " info = ydl.extract_info(f\"youtube.com/watch?v={i}\", download=False)\n", + " if \"creative commons\" in info.get(\"license\", \"\").lower():\n", + " video_ids_cc.append(i)\n", + " video_ids = video_ids_cc\n", + "\n", + " return {\"video_ids\": video_ids}\n", "\n", "\n", "def download_node(state: AgentState):\n", - " videos = download_videos(ids, config)\n", - " # save to state\n", - " return {\"something\": videos}\n", + "\n", + " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", + " video_dir = LOCAL_ROOT / \"videos\"\n", + " sub_dir = LOCAL_ROOT / \"subs\"\n", + "\n", + " discard_path = LOCAL_ROOT / \"videos_without_subs\"\n", + " discard_path.mkdir(parents=True, exist_ok=True)\n", + "\n", + " video_ids = state[\"video_ids\"]\n", + "\n", + " downloaded_video_ids = [video_path.name for video_path in video_dir.glob(\"*.mp4\")]\n", + " downloaded_video_ids += [\n", + " video_path.name for video_path in discard_path.glob(\"*.mp4\")\n", + " ]\n", + "\n", + " only_with_transcripts = True\n", + "\n", + " YDL_OPTIONS = {\n", + " \"writeautomaticsub\": True,\n", + " \"subtitleslangs\": [\"en\"],\n", + " \"subtitlesformat\": \"vtt\",\n", + " \"overwrites\": False,\n", + " \"format\": \"mp4\",\n", + " \"outtmpl\": {\n", + " \"default\": video_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", + " \"subtitle\": sub_dir.as_posix() + \"/%(id)s\",\n", + " },\n", + " }\n", + "\n", + " video_infos = []\n", + "\n", + " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", + " for video_id in video_ids:\n", + " if video_id not in downloaded_video_ids:\n", + " try:\n", + " url = f\"https://www.youtube.com/watch?v={video_id}\"\n", + " ydl.download(url)\n", + " except Exception as e:\n", + " print(datetime.now(), f\"Error at video {video_id}, skipping\")\n", + " print(datetime.now(), e)\n", + " continue\n", + "\n", + " video_path = Path(ydl.prepare_filename({\"id\": video_id}))\n", + " sub_path = Path(ydl.prepare_filename({\"id\": video_id, \"ext\": \"en.vtt\"}))\n", + "\n", + " with sub_path.open(\"r\") 
as f:\n", + " subs = f.read()\n", + "\n", + " transcript = vtt_to_txt(sub_path)\n", + "\n", + " video_info = VideoInfo.from_local_download(\n", + " video_id=video_id,\n", + " url=url,\n", + " video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),\n", + " subs=subs,\n", + " transcript=transcript,\n", + " )\n", + "\n", + " video_infos.append(video_info)\n", + "\n", + " if only_with_transcripts:\n", + " filtered_video_infos = []\n", + " for video_info in video_infos:\n", + " if video_info.transcript:\n", + " filtered_video_infos.append(video_info)\n", + " else:\n", + " video_path = LOCAL_ROOT / video_info.video_path\n", + " video_path.rename(discard_path / video_path.name)\n", + " video_infos = filtered_video_infos\n", + "\n", + " return {\"video_infos\": video_infos}\n", "\n", "\n", "def detect_segments_node(state: AgentState):\n", - " segments = detect_segments_clip(\n", - " # video_ids=['KvRK5Owqzgw'],\n", - " text_prompts=\"a person doing squats\", # that's the text for CLIP to compare to images. You can provide a list of texts to use average distance.\n", - " model=model,\n", - " processor=processor,\n", - " fps_sampling=2, # the more fps, the more granular segment borders and more precise segments, at the cost of speed.\n", - " device=\"cuda\", # 'cpu' for local\n", - " frames_per_batch=100, # 100 frames use about 10GB GPU RAM, so batch to fill your GPU RAM.\n", - " config=config,\n", - " )\n", - " return {\"segments\": segments}\n", "\n", + " clip_text_prompts = state[\"clip_text_prompts\"]\n", + " video_infos = state[\"video_infos\"]\n", "\n", - "def extract_clues_node(state: AgentState):\n", - " clues = []\n", - "\n", - " clues = generate_clues(\n", - " # video_ids=['byxWus7BwfQ'],\n", - " config=config,\n", - " human_prompt=human_prompt,\n", - " segments_per_call=5, # the output might be quite long, so need to limit number of segments per gpt call to respect max output legnth\n", - " raise_on_error=True, # interrupt when encountering an error. 
Useful for debugging.\n", - " )\n", - "\n", - " return {\"clues\": clues}\n", - "\n", - "\n", - "def gen_annotations_node(state: AgentState):\n", - "\n", - " class SegmentFeedback(BaseModel):\n", - " \"\"\"\n", - " —> GOOD EXAMPLES:\n", - " \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", - " \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", - " \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", - " \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", - " \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", - " \"correction\":\" Keep your weight on your heels and press through them as you rise.\"\n", - " \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", - " \"correction\":null\n", - " —> BAD EXAMPLES:\n", - " \"wrong\":\"knees\"\n", - " \"correction\":\"fix knees\"\n", - " \"wrong\":\"back looks funny\"\n", - " \"correction\":\"make back better\"\n", - " \"wrong\":\"feet are doing something\"\n", - " \"correction\":\"feet should be different\"\n", - " \"right\":\"arms\"\n", - " \"correction\":\"arms are fine i think\"\n", - " —> BAD EXAMPLES END HERE\n", - " \"\"\"\n", - "\n", - " right: Optional[str] = Field(description=\"what was right in the performance\")\n", - " wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n", - " correction: Optional[str] = Field(\n", - " description=\"how and in what ways it the performance could be improved\"\n", - " )\n", + " CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n", "\n", - " # The segment timestamps are taken from the provided information.\n", - " class SegmentAnnotation(BaseModel):\n", - " squats_probability: Optional[str] = Field(\n", - " description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n", - " )\n", - " squats_technique_correctness: Optional[str] = Field(\n", - " description=\"correctness of the squat technique.\"\n", + " model = AutoModel.from_pretrained(CLIP_MODEL_ID)\n", + " processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n", + "\n", + " dataset = VideoInferenceDataset(video_infos)\n", + " dataloader = torch.utils.data.DataLoader(dataset, num_workers=0, batch_size=100)\n", + "\n", + " smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n", + "\n", + " clip_results_dict = defaultdict(list)\n", + "\n", + " while True:\n", + " try:\n", + " batch = next(iter(dataloader))\n", + " except StopIteration:\n", + " break\n", + "\n", + " frames = batch[\"frame\"].to(\"cuda\")\n", + "\n", + " inputs = processor(\n", + " images=frames,\n", + " text=clip_text_prompts,\n", + " return_tensors=\"pt\",\n", + " padding=True,\n", + " truncation=True,\n", " )\n", - " squats_feedback: Optional[SegmentFeedback] = Field(\n", - " description=\"what was right and wrong in the squat perfomance in the segment. 
When the technique is incorrect, provide instructions how to correct them.\"\n", + "\n", + " logits = model(**inputs)\n", + " probs = torch.nn.functional.sigmoid(logits)\n", + "\n", + " for sample, prob in zip(batch, probs):\n", + " video_id = sample[\"video_id\"]\n", + " frame_idx = sample[\"frame_idx\"]\n", + " clip_results_dict[\"video_id\"].append(video_id)\n", + " clip_results_dict[\"frame_idx\"].append(frame_idx)\n", + " clip_results_dict[\"probs\"].append(prob.item())\n", + "\n", + " clip_results = pd.DataFrame(clip_results_dict)\n", + "\n", + " max_gap_seconds = 1\n", + " fps_sampling = 1\n", + " min_prob = 0.1\n", + " min_segment_seconds = 3\n", + " fps = 25\n", + "\n", + " for video_clip_results in clip_results.groupby(\"video_id\"):\n", + " probs = video_clip_results[\"probs\"].values\n", + " probs = smoother.smooth(probs).smooth_data[0]\n", + " segments_start_end = get_segments(\n", + " probs,\n", + " max_gap=round(max_gap_seconds * fps_sampling),\n", + " min_prob=min_prob,\n", + " min_segment=round(min_segment_seconds * fps_sampling),\n", " )\n", + " segments = []\n", + " for start, end in segments_start_end:\n", + " segments.append(\n", + " SegmentInfo.from_seconds(\n", + " start,\n", + " end,\n", + " fps=fps,\n", + " video_id=\"a\",\n", + " )\n", + " )\n", + "\n", + " return {\"segments\": segments}\n", "\n", - " annotations = generate_annotations(\n", - " human_prompt=human_prompt,\n", - " config=config,\n", - " segments_per_call=5,\n", - " annotation_schema=SegmentAnnotation,\n", - " )\n", "\n", - " return {\"annotations\": annotations}" + "# def extract_clues_node(state: AgentState):\n", + "# clues = []\n", + "\n", + "# clues = generate_clues(\n", + "# # video_ids=['byxWus7BwfQ'],\n", + "# config=config,\n", + "# human_prompt=human_prompt,\n", + "# segments_per_call=5, # the output might be quite long, so need to limit number of segments per gpt call to respect max output legnth\n", + "# raise_on_error=True, # interrupt when encountering an error. 
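# --- Editor's note (a sketch assuming the Hugging Face SigLIP API, not verified
# against this repo): in the new detect_segments_node above, `model(**inputs)`
# returns a model-output object rather than a raw tensor, so the sigmoid should be
# applied to its `logits_per_image` field, and the model and inputs must live on
# the same device. A minimal frame-vs-text scoring step:
import torch

def frame_probs(model, processor, frames, text_prompts, device="cuda"):
    """Score each frame against the text prompts with SigLIP's sigmoid head."""
    inputs = processor(images=frames, text=text_prompts,
                       return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.to(device)(**inputs)
    # SigLIP is trained with a sigmoid (not softmax) objective, so per-pair
    # probabilities come straight from logits_per_image:
    return torch.sigmoid(outputs.logits_per_image)  # shape: [n_frames, n_prompts]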
Useful for debugging.\n", + "# )\n", + "\n", + "# return {\"clues\": clues}\n", + "\n", + "\n", + "# def gen_annotations_node(state: AgentState):\n", + "\n", + "# class SegmentFeedback(BaseModel):\n", + "# \"\"\"\n", + "# —> GOOD EXAMPLES:\n", + "# \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", + "# \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", + "# \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", + "# \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", + "# \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", + "# \"correction\":\" Keep your weight on your heels and press through them as you rise.\"\n", + "# \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", + "# \"correction\":null\n", + "# —> BAD EXAMPLES:\n", + "# \"wrong\":\"knees\"\n", + "# \"correction\":\"fix knees\"\n", + "# \"wrong\":\"back looks funny\"\n", + "# \"correction\":\"make back better\"\n", + "# \"wrong\":\"feet are doing something\"\n", + "# \"correction\":\"feet should be different\"\n", + "# \"right\":\"arms\"\n", + "# \"correction\":\"arms are fine i think\"\n", + "# —> BAD EXAMPLES END HERE\n", + "# \"\"\"\n", + "\n", + "# right: Optional[str] = Field(description=\"what was right in the performance\")\n", + "# wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n", + "# correction: Optional[str] = Field(\n", + "# description=\"how and in what ways it the performance could be improved\"\n", + "# )\n", + "\n", + "# # The segment timestamps are taken from the provided information.\n", + "# class SegmentAnnotation(BaseModel):\n", + "# squats_probability: Optional[str] = Field(\n", + "# description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n", + "# )\n", + "# squats_technique_correctness: Optional[str] = Field(\n", + "# description=\"correctness of the squat technique.\"\n", + "# )\n", + "# squats_feedback: Optional[SegmentFeedback] = Field(\n", + "# description=\"what was right and wrong in the squat perfomance in the segment. 
When the technique is incorrect, provide instructions how to correct them.\"\n", + "# )\n", + "\n", + "# annotations = generate_annotations(\n", + "# human_prompt=human_prompt,\n", + "# config=config,\n", + "# segments_per_call=5,\n", + "# annotation_schema=SegmentAnnotation,\n", + "# )\n", + "\n", + "# return {\"annotations\": annotations}" ] }, { @@ -328,8 +605,8 @@ "builder.add_node(\"get_video_ids\", get_video_ids_node)\n", "builder.add_node(\"download\", download_node)\n", "builder.add_node(\"detect_segments\", detect_segments_node)\n", - "builder.add_node(\"extract_clues\", extract_clues_node)\n", - "builder.add_node(\"gen_annotations\", gen_annotations_node)\n", + "# builder.add_node(\"extract_clues\", extract_clues_node)\n", + "# builder.add_node(\"gen_annotations\", gen_annotations_node)\n", "\n", "builder.set_entry_point(\"generate_queries\")\n", "\n", @@ -342,9 +619,11 @@ "builder.add_edge(\"generate_queries\", \"get_video_ids\")\n", "builder.add_edge(\"get_video_ids\", \"download\")\n", "builder.add_edge(\"download\", \"detect_segments\")\n", - "builder.add_edge(\"detect_segments\", \"extract_clues\")\n", - "builder.add_edge(\"extract_clues\", \"gen_annotations\")\n", - "builder.add_edge(\"gen_annotations\", END)\n", + "builder.add_edge(\"detect_segments\", END)\n", + "\n", + "# builder.add_edge(\"detect_segments\", \"extract_clues\")\n", + "# builder.add_edge(\"extract_clues\", \"gen_annotations\")\n", + "# builder.add_edge(\"gen_annotations\", END)\n", "\n", "graph = builder.compile(checkpointer=memory)" ] From a179d62f4233d7027c9253f1060b80bb528c8867 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Fri, 23 Aug 2024 13:26:03 +0000 Subject: [PATCH 3/9] Add decord to requirements, update agent notebook to run first 3 nodes --- agent.ipynb | 158 ++++++++++++++++++++++++++++++++++++++--------- requirements.txt | 3 +- 2 files changed, 130 insertions(+), 31 deletions(-) diff --git a/agent.ipynb b/agent.ipynb index 651be48..097c7ad 100644 --- a/agent.ipynb +++ b/agent.ipynb @@ -2,7 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "_ = load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -15,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -84,17 +94,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# 2. 
Create the state\n", "\n", "class AgentState(TypedDict):\n", + "\ttask: str\n", "\tsearch_queries: List[str]\n", "\tvideo_ids: List[str]\n", "\tvideo_infos: List[VideoInfo]\n", - "\tclip_text_prompts = List[str]\n", + "\tclip_text_prompts: List[str] = [\"person doing squats\"]\n", "\tsegment_infos: List[SegmentInfo]\n", "\tclues = List[str]\n", "\tannotations = List[str]" @@ -102,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -211,30 +222,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "from datagen import DatagenConfig, get_video_ids, download_videos, detect_segments_clip, generate_clues, generate_annotations\n", + "# from datagen import DatagenConfig, get_video_ids, download_videos, detect_segments_clip, generate_clues, generate_annotations\n", "\n", - "config_params = {\n", - " \"openai\": {\n", - " \"type\": \"azure\", # openai/azure\n", - " \"temperature\": \"1\",\n", - " \"deployment\": \"gpt4o\", # model for openai / deployment for azure\n", - " },\n", - " \"data_dir\": \"./tmp/squats\",\n", - "}\n", + "# config_params = {\n", + "# \"openai\": {\n", + "# \"type\": \"azure\", # openai/azure\n", + "# \"temperature\": \"1\",\n", + "# \"deployment\": \"gpt4o\", # model for openai / deployment for azure\n", + "# },\n", + "# \"data_dir\": \"./tmp/squats\",\n", + "# }\n", "\n", - "!mkdir -p {config_params[\"data_dir\"]}\n", + "# !mkdir -p {config_params[\"data_dir\"]}\n", "\n", - "# this config handles all the bookeeping so you need to pass it everywhere.\n", - "config = DatagenConfig(**config_params)" + "# # this config handles all the bookeeping so you need to pass it everywhere.\n", + "# config = DatagenConfig(**config_params)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -253,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -299,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -313,20 +324,20 @@ " search_queries: list[str] = Field(default=None, description=\"a list of queries\")\n", "\n", " messages = [\n", - " SystemMessage(content=GEN_QUERIES_PROMPT),\n", + " SystemMessage(content=str(GEN_QUERIES_PROMPT)),\n", " HumanMessage(content=state[\"task\"]),\n", " ]\n", "\n", " model = llm.with_structured_output(QueryList)\n", " response: QueryList = model.invoke(messages)\n", "\n", - " return {\"search_queries\": response.search_queries}\n", + " return {\"search_queries\": response.search_queries[:2]}\n", "\n", "\n", "def get_video_ids_node(state: AgentState):\n", "\n", " queries = state[\"search_queries\"]\n", - " videos_per_query = 2\n", + " videos_per_query = 1\n", " sleep = 0\n", " sort_by = \"relevance\"\n", " results_type = \"video\"\n", @@ -578,7 +589,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -595,7 +606,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -630,19 +641,98 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'generate_queries': {'search_queries': ['how to 
do squats', 'squat exercise tutorial']}}\n", + "{'get_video_ids': {'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E']}}\n", + "[youtube] Extracting URL: https://www.youtube.com/watch?v=xqvCmoLULNY\n", + "[youtube] xqvCmoLULNY: Downloading webpage\n", + "[youtube] xqvCmoLULNY: Downloading ios player API JSON\n", + "[youtube] xqvCmoLULNY: Downloading web creator player API JSON\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR: [youtube] xqvCmoLULNY: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-08-22 08:30:23.552248 Error at video xqvCmoLULNY, skipping\n", + "2024-08-22 08:30:23.552297 ERROR: [youtube] xqvCmoLULNY: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n", + "[youtube] Extracting URL: https://www.youtube.com/watch?v=IB_icWRzi4E\n", + "[youtube] IB_icWRzi4E: Downloading webpage\n", + "[youtube] IB_icWRzi4E: Downloading ios player API JSON\n", + "[youtube] IB_icWRzi4E: Downloading web creator player API JSON\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR: [youtube] IB_icWRzi4E: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-08-22 08:30:24.139454 Error at video IB_icWRzi4E, skipping\n", + "2024-08-22 08:30:24.139505 ERROR: [youtube] IB_icWRzi4E: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n", + "{'download': {'video_infos': []}}\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'video_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m thread \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconfigurable\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthread_id\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1\u001b[39m\u001b[38;5;124m\"\u001b[39m}}\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstream\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtask\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mi wanna teach people how to do squats\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mthread\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/langgraph/pregel/__init__.py:1029\u001b[0m, in 
\u001b[0;36mPregel.stream\u001b[0;34m(self, input, config, stream_mode, output_keys, interrupt_before, interrupt_after, debug)\u001b[0m\n\u001b[1;32m 1026\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1028\u001b[0m \u001b[38;5;66;03m# panic on failure or timeout\u001b[39;00m\n\u001b[0;32m-> 1029\u001b[0m \u001b[43m_panic_or_proceed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minflight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mloop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1030\u001b[0m \u001b[38;5;66;03m# don't keep futures around in memory longer than needed\u001b[39;00m\n\u001b[1;32m 1031\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m done, inflight, futures\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/langgraph/pregel/__init__.py:1456\u001b[0m, in \u001b[0;36m_panic_or_proceed\u001b[0;34m(done, inflight, step, timeout_exc_cls)\u001b[0m\n\u001b[1;32m 1454\u001b[0m inflight\u001b[38;5;241m.\u001b[39mpop()\u001b[38;5;241m.\u001b[39mcancel()\n\u001b[1;32m 1455\u001b[0m \u001b[38;5;66;03m# raise the exception\u001b[39;00m\n\u001b[0;32m-> 1456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[1;32m 1458\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inflight:\n\u001b[1;32m 1459\u001b[0m \u001b[38;5;66;03m# if we got here means we timed out\u001b[39;00m\n\u001b[1;32m 1460\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m inflight:\n\u001b[1;32m 1461\u001b[0m \u001b[38;5;66;03m# cancel all pending tasks\u001b[39;00m\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/langgraph/pregel/executor.py:60\u001b[0m, in \u001b[0;36mBackgroundExecutor.done\u001b[0;34m(self, task)\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdone\u001b[39m(\u001b[38;5;28mself\u001b[39m, task: concurrent\u001b[38;5;241m.\u001b[39mfutures\u001b[38;5;241m.\u001b[39mFuture) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 60\u001b[0m \u001b[43mtask\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GraphInterrupt:\n\u001b[1;32m 62\u001b[0m \u001b[38;5;66;03m# This exception is an interruption signal, not an error\u001b[39;00m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# so we don't want to re-raise it on exit\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtasks\u001b[38;5;241m.\u001b[39mpop(task)\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/concurrent/futures/_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[0;32m--> 449\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[1;32m 453\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/concurrent/futures/_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[1;32m 400\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 401\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 403\u001b[0m \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/concurrent/futures/thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/langgraph/pregel/retry.py:25\u001b[0m, in \u001b[0;36mrun_with_retry\u001b[0;34m(task, retry_policy)\u001b[0m\n\u001b[1;32m 23\u001b[0m task\u001b[38;5;241m.\u001b[39mwrites\u001b[38;5;241m.\u001b[39mclear()\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# run the task\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[43mtask\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtask\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minput\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtask\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# if successful, end\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/langchain_core/runnables/base.py:2876\u001b[0m, in \u001b[0;36mRunnableSequence.invoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 2874\u001b[0m context\u001b[38;5;241m.\u001b[39mrun(_set_config_context, config)\n\u001b[1;32m 2875\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 2876\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m 
\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minvoke\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2877\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2878\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m context\u001b[38;5;241m.\u001b[39mrun(step\u001b[38;5;241m.\u001b[39minvoke, \u001b[38;5;28minput\u001b[39m, config)\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/langgraph/utils.py:102\u001b[0m, in \u001b[0;36mRunnableCallable.invoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m accepts_config(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunc):\n\u001b[1;32m 101\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconfig\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m config\n\u001b[0;32m--> 102\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(ret, Runnable) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrecurse:\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ret\u001b[38;5;241m.\u001b[39minvoke(\u001b[38;5;28minput\u001b[39m, config)\n", + "Cell \u001b[0;32mIn[10], line 184\u001b[0m, in \u001b[0;36mdetect_segments_node\u001b[0;34m(state)\u001b[0m\n\u001b[1;32m 181\u001b[0m min_segment_seconds \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 182\u001b[0m fps \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m25\u001b[39m\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m video_clip_results \u001b[38;5;129;01min\u001b[39;00m \u001b[43mclip_results\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvideo_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 185\u001b[0m probs \u001b[38;5;241m=\u001b[39m video_clip_results[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprobs\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 186\u001b[0m probs \u001b[38;5;241m=\u001b[39m smoother\u001b[38;5;241m.\u001b[39msmooth(probs)\u001b[38;5;241m.\u001b[39msmooth_data[\u001b[38;5;241m0\u001b[39m]\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/pandas/core/frame.py:9183\u001b[0m, in \u001b[0;36mDataFrame.groupby\u001b[0;34m(self, by, axis, level, as_index, sort, group_keys, observed, dropna)\u001b[0m\n\u001b[1;32m 9180\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m level \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m 
\u001b[38;5;129;01mand\u001b[39;00m by \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 9181\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have to supply one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mby\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 9183\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mDataFrameGroupBy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9184\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9185\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9186\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9187\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9188\u001b[0m \u001b[43m \u001b[49m\u001b[43mas_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mas_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9189\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9190\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroup_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9191\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9192\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9193\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:1329\u001b[0m, in \u001b[0;36mGroupBy.__init__\u001b[0;34m(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, observed, dropna)\u001b[0m\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropna \u001b[38;5;241m=\u001b[39m dropna\n\u001b[1;32m 1328\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m grouper \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1329\u001b[0m grouper, exclusions, obj \u001b[38;5;241m=\u001b[39m \u001b[43mget_grouper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1330\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1331\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1332\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1333\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1334\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1335\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mno_default\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1336\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1337\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m observed \u001b[38;5;129;01mis\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mno_default:\n\u001b[1;32m 1340\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(ping\u001b[38;5;241m.\u001b[39m_passed_categorical \u001b[38;5;28;01mfor\u001b[39;00m ping \u001b[38;5;129;01min\u001b[39;00m grouper\u001b[38;5;241m.\u001b[39mgroupings):\n", + "File \u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/pandas/core/groupby/grouper.py:1043\u001b[0m, in \u001b[0;36mget_grouper\u001b[0;34m(obj, key, axis, level, sort, observed, validate, dropna)\u001b[0m\n\u001b[1;32m 1041\u001b[0m in_axis, level, gpr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, gpr, \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1043\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(gpr)\n\u001b[1;32m 1044\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(gpr, Grouper) \u001b[38;5;129;01mand\u001b[39;00m gpr\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1045\u001b[0m \u001b[38;5;66;03m# Add key to exclusions\u001b[39;00m\n\u001b[1;32m 1046\u001b[0m exclusions\u001b[38;5;241m.\u001b[39madd(gpr\u001b[38;5;241m.\u001b[39mkey)\n", + "\u001b[0;31mKeyError\u001b[0m: 'video_id'" + ] + } + ], "source": [ "thread = {\"configurable\": {\"thread_id\": \"1\"}}\n", "for s in graph.stream(\n", " {\n", - " \"task\": \"what is the difference between langchain and langsmith\",\n", + " \"task\": \"i wanna teach people how to do squats\",\n", " },\n", " thread,\n", "):\n", " print(s)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -652,7 +742,15 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", "version": "3.12.0" } }, diff --git a/requirements.txt b/requirements.txt index 3e590de..3b41483 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ tsmoothie torch sentencepiece protobuf -transformers \ No newline at end of file +transformers +decord \ No newline at end of file From f9633ebdde748794cdfe24f59c7f2f195ec51c63 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Mon, 26 Aug 2024 10:16:43 +0000 Subject: [PATCH 4/9] Debug agent through segments --- agent.ipynb | 554 
+++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 444 insertions(+), 110 deletions(-) diff --git a/agent.ipynb b/agent.ipynb index 097c7ad..3e17daa 100644 --- a/agent.ipynb +++ b/agent.ipynb @@ -71,13 +71,13 @@ " # **kwargs,\n", " # )\n", "\n", - " @classmethod\n", - " def from_seconds(cls, start_seconds, end_seconds, **kwargs):\n", - " return cls(\n", - " start_timestamp=seconds_to_ts(start_seconds),\n", - " end_timestamp=seconds_to_ts(end_seconds),\n", - " **kwargs,\n", - " )\n", + " # @classmethod\n", + " # def from_seconds(cls, start_seconds, end_seconds, **kwargs):\n", + " # return cls(\n", + " # start_timestamp=seconds_to_ts(start_seconds),\n", + " # end_timestamp=seconds_to_ts(end_seconds),\n", + " # **kwargs,\n", + " # )\n", "\n", " # def to_str(self, skip: list[str] = []):\n", " # # skip -> fields from segment_info\n", @@ -105,7 +105,7 @@ "\tsearch_queries: List[str]\n", "\tvideo_ids: List[str]\n", "\tvideo_infos: List[VideoInfo]\n", - "\tclip_text_prompts: List[str] = [\"person doing squats\"]\n", + "\tclip_text_prompts: List[str]\n", "\tsegment_infos: List[SegmentInfo]\n", "\tclues = List[str]\n", "\tannotations = List[str]" @@ -272,39 +272,48 @@ "import math\n", "import numpy as np\n", "\n", - "decord.bridge.set_bridge(\"torch\")\n", + "# decord.bridge.set_bridge(\"torch\")\n", "\n", "\n", "class VideoInferenceDataset(torch.utils.data.IterableDataset):\n", - " def __init__(self, video_infos: List[VideoInfo]):\n", + " def __init__(self, video_infos: List[VideoInfo], local_root: Path):\n", " super(VideoInferenceDataset).__init__()\n", "\n", " self.video_infos = video_infos\n", - " self.frame_generator = self.get_frame_generator(video_infos)\n", + " self.local_root = local_root\n", + " self.frame_generator = self.get_frame_generator(video_infos, local_root)\n", "\n", " @staticmethod\n", - " def get_frame_generator(video_infos):\n", + " def get_frame_generator(video_infos, local_root: Path):\n", "\n", - " for video_info in video_infos:\n", - " video_path = Path(video_info.relative_video_path)\n", + " for video_idx, video_info in enumerate(video_infos):\n", + " video_path = local_root.joinpath(video_info.relative_video_path)\n", " vr = decord.VideoReader(str(video_path))\n", " num_frames = len(vr)\n", - " fps = math.ceil(num_frames / video_info.duration)\n", - " frame_indices = range(0, num_frames, fps)\n", + " fps = vr.get_avg_fps()\n", + " frame_indices = range(0, num_frames, round(fps))\n", + "\n", + " # print(f\"Num frames: {num_frames}, fps: {fps}\")\n", + " # print(f\"Len frame indices: {len(frame_indices)}\")\n", + "\n", + " # frames = vr.get_batch(frame_indices)\n", "\n", " for frame_idx in frame_indices:\n", + " print(f\"Frame idx {frame_idx}\")\n", " frame = vr[frame_idx].asnumpy()\n", " yield {\n", " \"frame\": frame,\n", " \"frame_idx\": frame_idx,\n", - " \"video_id\": video_info.video_id,\n", + " \"video_id\": video_idx,\n", " }\n", "\n", + " # print(\"video done\")\n", + " # print(\"all videos done\")\n", + "\n", " def __next__(self):\n", " return next(self.frame_generator)\n", "\n", " def __iter__(self):\n", - " # worker_info = torch.utils.data.get_worker_info()\n", " return self" ] }, @@ -314,6 +323,9 @@ "metadata": {}, "outputs": [], "source": [ + "import time\n", + "import math\n", + "\n", "# 4. 
Create nodes\n", "\n", "\n", @@ -383,11 +395,13 @@ "\n", " video_ids = state[\"video_ids\"]\n", "\n", - " downloaded_video_ids = [video_path.name for video_path in video_dir.glob(\"*.mp4\")]\n", + " downloaded_video_ids = [video_path.stem for video_path in video_dir.glob(\"*.mp4\")]\n", " downloaded_video_ids += [\n", - " video_path.name for video_path in discard_path.glob(\"*.mp4\")\n", + " video_path.stem for video_path in discard_path.glob(\"*.mp4\")\n", " ]\n", "\n", + " print(f\"Downloaded video ids: {downloaded_video_ids}\")\n", + "\n", " only_with_transcripts = True\n", "\n", " YDL_OPTIONS = {\n", @@ -398,7 +412,7 @@ " \"format\": \"mp4\",\n", " \"outtmpl\": {\n", " \"default\": video_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", - " \"subtitle\": sub_dir.as_posix() + \"/%(id)s\",\n", + " \"subtitle\": sub_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", " },\n", " }\n", "\n", @@ -406,27 +420,32 @@ "\n", " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", " for video_id in video_ids:\n", + " url = f\"https://www.youtube.com/watch?v={video_id}\"\n", + "\n", " if video_id not in downloaded_video_ids:\n", " try:\n", - " url = f\"https://www.youtube.com/watch?v={video_id}\"\n", " ydl.download(url)\n", " except Exception as e:\n", " print(datetime.now(), f\"Error at video {video_id}, skipping\")\n", " print(datetime.now(), e)\n", " continue\n", "\n", - " video_path = Path(ydl.prepare_filename({\"id\": video_id}))\n", - " sub_path = Path(ydl.prepare_filename({\"id\": video_id, \"ext\": \"en.vtt\"}))\n", + " video_path = Path(ydl.prepare_filename({\"id\": video_id, \"ext\": \"mp4\"}))\n", + " sub_path = Path(\n", + " ydl.prepare_filename(\n", + " {\"id\": video_id, \"ext\": \"en.vtt\"}, dir_type=\"subtitle\"\n", + " )\n", + " )\n", "\n", " with sub_path.open(\"r\") as f:\n", " subs = f.read()\n", "\n", " transcript = vtt_to_txt(sub_path)\n", "\n", - " video_info = VideoInfo.from_local_download(\n", + " video_info = VideoInfo(\n", " video_id=video_id,\n", " url=url,\n", - " video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),\n", + " relative_video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),\n", " subs=subs,\n", " transcript=transcript,\n", " )\n", @@ -446,50 +465,125 @@ " return {\"video_infos\": video_infos}\n", "\n", "\n", + "DATAFRAME = None\n", + "\n", + "\n", "def detect_segments_node(state: AgentState):\n", "\n", + " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", + "\n", " clip_text_prompts = state[\"clip_text_prompts\"]\n", " video_infos = state[\"video_infos\"]\n", "\n", " CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n", "\n", - " model = AutoModel.from_pretrained(CLIP_MODEL_ID)\n", + " model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(\"cuda\")\n", " processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n", "\n", - " dataset = VideoInferenceDataset(video_infos)\n", - " dataloader = torch.utils.data.DataLoader(dataset, num_workers=0, batch_size=100)\n", + " dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n", + "\n", + " # # Define a `worker_init_fn` that configures each dataset copy differently\n", + " # def worker_init_fn(worker_id):\n", + " # worker_info = torch.utils.data.get_worker_info()\n", + " # worker_dataset = worker_info.dataset # the dataset copy in this worker process\n", + " # video_infos = worker_dataset.video_infos\n", + "\n", + " # chunk_size = math.ceil(len(video_infos) / worker_info.num_workers)\n", + " # video_infos_chunks = [\n", + " # video_infos[i : i + chunk_size]\n", + " # for i in range(0, len(video_infos), chunk_size)\n", + " 
# ]\n", + "\n", + " # worker_dataset.video_infos = video_infos_chunks[worker_info.id]\n", + "\n", + " # # print(worker_dataset.video_infos)\n", + " # print(f\"Worker {worker_info.id} initialized\")\n", + "\n", + " # configure the dataset to only process the split workload\n", + "\n", + " # per_worker = math.ceil(len(dataset) / float(worker_info.num_workers))\n", + "\n", + " # worker_id = worker_info.id\n", + " # dataset.start = overall_start + worker_id * per_worker\n", + " # dataset.end = min(dataset.start + per_worker, overall_end)\n", + "\n", + " dataloader = torch.utils.data.DataLoader(\n", + " dataset,\n", + " num_workers=1,\n", + " batch_size=12,\n", + " pin_memory=True,\n", + " # worker_init_fn=worker_init_fn,\n", + " )\n", + " dataloader = iter(dataloader)\n", "\n", " smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n", "\n", " clip_results_dict = defaultdict(list)\n", "\n", - " while True:\n", + " print(\"Init model complete\")\n", + "\n", + " batch_counter = 0\n", + " MAX_BATCHES = 50\n", + "\n", + " while batch_counter < MAX_BATCHES:\n", + " batch_counter += 1\n", " try:\n", - " batch = next(iter(dataloader))\n", + " start_time = time.time()\n", + " batch = next(dataloader)\n", + " # print(f\"Fetch time: {time.time() - start_time:.2f} seconds\")\n", " except StopIteration:\n", " break\n", "\n", - " frames = batch[\"frame\"].to(\"cuda\")\n", + " # print(\"Batch fetched\")\n", + " # # print(batch)\n", + "\n", + " # time.sleep(30)\n", "\n", + " start_time = time.time()\n", " inputs = processor(\n", - " images=frames,\n", + " images=batch[\"frame\"],\n", " text=clip_text_prompts,\n", " return_tensors=\"pt\",\n", " padding=True,\n", " truncation=True,\n", " )\n", + " inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n", + " # print(f\"Proc time: {time.time() - start_time:.2f} seconds\")\n", + "\n", + " # print(\"Inputs prepared\")\n", + " # time.sleep(5)\n", + " # print(inputs[\"pixel_values\"].shape)\n", + " # print(inputs[\"input_ids\"].shape)\n", + "\n", + " start_time = time.time()\n", + " outputs = model(**inputs)\n", + " # print(f\"Forward time: {time.time() - start_time:.2f} seconds\")\n", + "\n", + " logits = outputs.logits_per_image\n", + " probs = torch.nn.functional.sigmoid(logits).detach().cpu().numpy()\n", "\n", - " logits = model(**inputs)\n", - " probs = torch.nn.functional.sigmoid(logits)\n", + " # print(\"Forward pass complete\")\n", + " # print(f\"video_id {len(batch[\"video_id\"])}\")\n", + " # print(len(probs))\n", + "\n", + " for video_idx, frame_idx, prob in zip(\n", + " batch[\"video_id\"], batch[\"frame_idx\"], probs\n", + " ):\n", + " # print(type(video_id.item()), type(frame_idx.item()), type(prob.item()))\n", + " video_id = video_infos[video_idx.item()].video_id\n", "\n", - " for sample, prob in zip(batch, probs):\n", - " video_id = sample[\"video_id\"]\n", - " frame_idx = sample[\"frame_idx\"]\n", " clip_results_dict[\"video_id\"].append(video_id)\n", - " clip_results_dict[\"frame_idx\"].append(frame_idx)\n", + " clip_results_dict[\"frame_idx\"].append(frame_idx.item())\n", " clip_results_dict[\"probs\"].append(prob.item())\n", "\n", + " # print(f\"Len clip results: {len(clip_results_dict['video_id'])}\")\n", + "\n", + " # print(\"Outputs parsed\")\n", + "\n", + " print(\"All frames processed\")\n", " clip_results = pd.DataFrame(clip_results_dict)\n", + " print(\"Dataframe created\")\n", + " print(clip_results)\n", "\n", " max_gap_seconds = 1\n", " fps_sampling = 1\n", @@ -497,7 +591,8 @@ " min_segment_seconds = 3\n", " fps = 25\n", 
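The parameters above are consumed by `get_segments`, which is imported from the project's own utilities and never shown in this patch series. As a rough sketch of the thresholding it appears to perform (the exact merge rule is an assumption inferred from the names `max_gap_seconds`, `min_prob`, and `min_segment_seconds`), something like the following would turn the smoothed per-second SigLIP probabilities into `(start, end)` sample-index pairs:

```python
# Hypothetical sketch of a get_segments-style helper; the real implementation
# lives outside this notebook and may differ in details.
def get_segments_sketch(probs, max_gap=1, min_prob=0.2, min_segment=3):
    hits = [i for i, p in enumerate(probs) if p >= min_prob]
    segments = []
    for i in hits:
        # Merge into the previous segment if the run of sub-threshold
        # samples separating them is at most max_gap long.
        if segments and i - segments[-1][1] - 1 <= max_gap:
            segments[-1][1] = i
        else:
            segments.append([i, i])
    # Keep only segments that span at least min_segment samples.
    return [(s, e) for s, e in segments if e - s + 1 >= min_segment]

print(get_segments_sketch([0.1, 0.5, 0.6, 0.1, 0.7, 0.8, 0.1]))  # [(1, 5)]
```

With one sampled frame per second (`fps_sampling = 1`), the returned indices read directly as seconds into the video, which is what the timestamp conversion in the next hunk relies on.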
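The hunk that follows converts those integer second offsets to `HH:MM:SS.mmm` strings with a small `sec2ts` lambda. For reference, here is a standalone equivalent with worked examples; the rounding caveat in the comment is an observation about the helper, not something handled in the original code:

```python
import time

def sec2ts(seconds: float) -> str:
    """Format float seconds as HH:MM:SS.mmm, e.g. 93.5 -> '00:01:33.500'."""
    ms = round((seconds % 1) * 1000)
    # Caveat: an input like 1.9996 rounds ms to 1000 without carrying into
    # the seconds; integer second offsets, as used here, never hit that case.
    return time.strftime(f"%H:%M:%S.{ms:03d}", time.gmtime(seconds))

print(sec2ts(93.5))     # 00:01:33.500
print(sec2ts(3671.25))  # 01:01:11.250
```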
"\n", - " for video_clip_results in clip_results.groupby(\"video_id\"):\n", + " segment_infos = []\n", + " for video_id, video_clip_results in clip_results.groupby(\"video_id\"):\n", " probs = video_clip_results[\"probs\"].values\n", " probs = smoother.smooth(probs).smooth_data[0]\n", " segments_start_end = get_segments(\n", @@ -506,18 +601,25 @@ " min_prob=min_prob,\n", " min_segment=round(min_segment_seconds * fps_sampling),\n", " )\n", - " segments = []\n", + "\n", + " print(f\"Segments for video {video_id}: {segments_start_end}\")\n", + "\n", + " sec2ts = lambda s: time.strftime(\n", + " f\"%H:%M:%S.{round((s%1)*1000):03d}\", time.gmtime(s)\n", + " )\n", + "\n", + " \n", " for start, end in segments_start_end:\n", - " segments.append(\n", - " SegmentInfo.from_seconds(\n", - " start,\n", - " end,\n", + " segment_infos.append(\n", + " SegmentInfo(\n", + " start_timestamp=sec2ts(start),\n", + " end_timestamp=sec2ts(end),\n", " fps=fps,\n", - " video_id=\"a\",\n", + " video_id=video_id,\n", " )\n", " )\n", "\n", - " return {\"segments\": segments}\n", + " return {\"segment_infos\": segment_infos}\n", "\n", "\n", "# def extract_clues_node(state: AgentState):\n", @@ -650,69 +752,253 @@ "text": [ "{'generate_queries': {'search_queries': ['how to do squats', 'squat exercise tutorial']}}\n", "{'get_video_ids': {'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E']}}\n", - "[youtube] Extracting URL: https://www.youtube.com/watch?v=xqvCmoLULNY\n", - "[youtube] xqvCmoLULNY: Downloading webpage\n", - "[youtube] xqvCmoLULNY: Downloading ios player API JSON\n", - "[youtube] xqvCmoLULNY: Downloading web creator player API JSON\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR: [youtube] xqvCmoLULNY: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-08-22 08:30:23.552248 Error at video xqvCmoLULNY, skipping\n", - "2024-08-22 08:30:23.552297 ERROR: [youtube] xqvCmoLULNY: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n", - "[youtube] Extracting URL: https://www.youtube.com/watch?v=IB_icWRzi4E\n", - "[youtube] IB_icWRzi4E: Downloading webpage\n", - "[youtube] IB_icWRzi4E: Downloading ios player API JSON\n", - "[youtube] IB_icWRzi4E: Downloading web creator player API JSON\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR: [youtube] IB_icWRzi4E: Sign in to confirm you’re not a bot. This helps protect our community. Learn more\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-08-22 08:30:24.139454 Error at video IB_icWRzi4E, skipping\n", - "2024-08-22 08:30:24.139505 ERROR: [youtube] IB_icWRzi4E: Sign in to confirm you’re not a bot. This helps protect our community. 
Learn more\n",
-      "{'download': {'video_infos': []}}\n"
-     ]
-    },
-    {
-     "ename": "KeyError",
-     "evalue": "'video_id'",
-     "output_type": "error",
-     "traceback": [
-      "… (removed output: the same KeyError: 'video_id' traceback already shown in full earlier in this series; elided here) …",
-      "File 
\u001b[0;32m~/.conda/envs/databuilder_agent/lib/python3.12/site-packages/pandas/core/groupby/grouper.py:1043\u001b[0m, in \u001b[0;36mget_grouper\u001b[0;34m(obj, key, axis, level, sort, observed, validate, dropna)\u001b[0m\n\u001b[1;32m 1041\u001b[0m in_axis, level, gpr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, gpr, \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1043\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(gpr)\n\u001b[1;32m 1044\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(gpr, Grouper) \u001b[38;5;129;01mand\u001b[39;00m gpr\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1045\u001b[0m \u001b[38;5;66;03m# Add key to exclusions\u001b[39;00m\n\u001b[1;32m 1046\u001b[0m exclusions\u001b[38;5;241m.\u001b[39madd(gpr\u001b[38;5;241m.\u001b[39mkey)\n", - "\u001b[0;31mKeyError\u001b[0m: 'video_id'" + "Downloaded video ids: ['IB_icWRzi4E', 'xqvCmoLULNY']\n", + "{'download': {'video_infos': [VideoInfo(video_id='xqvCmoLULNY', url='https://www.youtube.com/watch?v=xqvCmoLULNY', relative_video_path='videos/xqvCmoLULNY.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:00.160 --> 00:00:01.829 align:start position:0%\\n \\nlet's<00:00:00.399> learn<00:00:00.560> how<00:00:00.719> to<00:00:00.880> properly<00:00:01.280> perform<00:00:01.760> a\\n\\n00:00:01.829 --> 00:00:01.839 align:start position:0%\\nlet's learn how to properly perform a\\n \\n\\n00:00:01.839 --> 00:00:02.790 align:start position:0%\\nlet's learn how to properly perform a\\nsquat\\n\\n00:00:02.790 --> 00:00:02.800 align:start position:0%\\nsquat\\n \\n\\n00:00:02.800 --> 00:00:04.470 align:start position:0%\\nsquat\\nstart<00:00:03.120> with<00:00:03.199> your<00:00:03.360> feet<00:00:03.679> slightly<00:00:04.080> wider<00:00:04.319> than\\n\\n00:00:04.470 --> 00:00:04.480 align:start position:0%\\nstart with your feet slightly wider than\\n \\n\\n00:00:04.480 --> 00:00:06.389 align:start position:0%\\nstart with your feet slightly wider than\\nshoulder<00:00:04.799> width<00:00:05.120> apart\\n\\n00:00:06.389 --> 00:00:06.399 align:start position:0%\\nshoulder width apart\\n \\n\\n00:00:06.399 --> 00:00:09.190 align:start position:0%\\nshoulder width apart\\ncross<00:00:06.799> your<00:00:07.040> arms<00:00:07.440> in<00:00:07.759> front\\n\\n00:00:09.190 --> 00:00:09.200 align:start position:0%\\ncross your arms in front\\n \\n\\n00:00:09.200 --> 00:00:11.270 align:start position:0%\\ncross your arms in front\\nso<00:00:09.440> touch<00:00:09.679> your<00:00:09.920> right<00:00:10.240> hand<00:00:10.559> to<00:00:10.719> your<00:00:10.960> left\\n\\n00:00:11.270 --> 00:00:11.280 align:start position:0%\\nso touch your right hand to your left\\n \\n\\n00:00:11.280 --> 00:00:13.350 align:start position:0%\\nso touch your right hand to your left\\nshoulder<00:00:12.080> and<00:00:12.320> vice<00:00:12.559> versa<00:00:13.040> pointing<00:00:13.280> your\\n\\n00:00:13.350 --> 00:00:13.360 align:start position:0%\\nshoulder and vice versa pointing your\\n \\n\\n00:00:13.360 --> 00:00:15.190 align:start position:0%\\nshoulder and vice versa pointing your\\nelbows<00:00:13.679> straight<00:00:13.920> ahead<00:00:14.559> now<00:00:14.719> from<00:00:14.960> here\\n\\n00:00:15.190 --> 00:00:15.200 align:start position:0%\\nelbows straight ahead now 
from here\\n \\n\\n00:00:15.200 --> 00:00:17.109 align:start position:0%\\nelbows straight ahead now from here\\nshift<00:00:15.440> your<00:00:15.679> weight<00:00:16.160> to<00:00:16.320> the<00:00:16.480> ball<00:00:16.720> of<00:00:16.880> your\\n\\n00:00:17.109 --> 00:00:17.119 align:start position:0%\\nshift your weight to the ball of your\\n \\n\\n00:00:17.119 --> 00:00:18.230 align:start position:0%\\nshift your weight to the ball of your\\nfeet\\n\\n00:00:18.230 --> 00:00:18.240 align:start position:0%\\nfeet\\n \\n\\n00:00:18.240 --> 00:00:21.109 align:start position:0%\\nfeet\\nand<00:00:18.400> bend<00:00:18.720> your<00:00:18.880> knees\\n\\n00:00:21.109 --> 00:00:21.119 align:start position:0%\\nand bend your knees\\n \\n\\n00:00:21.119 --> 00:00:23.189 align:start position:0%\\nand bend your knees\\nget<00:00:21.359> as<00:00:21.520> close<00:00:21.680> to<00:00:21.840> 90<00:00:22.160> degrees<00:00:22.480> as<00:00:22.640> you<00:00:22.800> can\\n\\n00:00:23.189 --> 00:00:23.199 align:start position:0%\\nget as close to 90 degrees as you can\\n \\n\\n00:00:23.199 --> 00:00:25.830 align:start position:0%\\nget as close to 90 degrees as you can\\nlooking<00:00:23.519> straight<00:00:23.840> ahead<00:00:24.480> and<00:00:24.720> from<00:00:24.960> here\\n\\n00:00:25.830 --> 00:00:25.840 align:start position:0%\\nlooking straight ahead and from here\\n \\n\\n00:00:25.840 --> 00:00:29.109 align:start position:0%\\nlooking straight ahead and from here\\npush<00:00:26.160> back<00:00:26.400> up<00:00:26.480> to<00:00:26.640> the<00:00:26.720> starting<00:00:27.039> position\\n\\n00:00:29.109 --> 00:00:29.119 align:start position:0%\\npush back up to the starting position\\n \\n\\n00:00:29.119 --> 00:00:30.150 align:start position:0%\\npush back up to the starting position\\nthis<00:00:29.359> is<00:00:29.439> going<00:00:29.599> to<00:00:29.679> be<00:00:29.760> great<00:00:30.000> for\\n\\n00:00:30.150 --> 00:00:30.160 align:start position:0%\\nthis is going to be great for\\n \\n\\n00:00:30.160 --> 00:00:31.830 align:start position:0%\\nthis is going to be great for\\nstrengthening\\n\\n00:00:31.830 --> 00:00:31.840 align:start position:0%\\nstrengthening\\n \\n\\n00:00:31.840 --> 00:00:34.069 align:start position:0%\\nstrengthening\\nyour<00:00:32.079> thighs<00:00:32.480> or<00:00:32.559> your<00:00:32.719> quadriceps<00:00:33.680> as<00:00:33.840> well\\n\\n00:00:34.069 --> 00:00:34.079 align:start position:0%\\nyour thighs or your quadriceps as well\\n \\n\\n00:00:34.079 --> 00:00:36.150 align:start position:0%\\nyour thighs or your quadriceps as well\\nas<00:00:34.239> your<00:00:34.480> butt<00:00:34.719> or<00:00:34.800> your<00:00:34.960> glutes\\n\\n00:00:36.150 --> 00:00:36.160 align:start position:0%\\nas your butt or your glutes\\n \\n\\n00:00:36.160 --> 00:00:37.750 align:start position:0%\\nas your butt or your glutes\\nand<00:00:36.399> it's<00:00:36.559> also<00:00:36.800> going<00:00:36.800> to<00:00:36.880> be<00:00:37.040> great<00:00:37.280> to<00:00:37.440> work\\n\\n00:00:37.750 --> 00:00:37.760 align:start position:0%\\nand it's also going to be great to work\\n \\n\\n00:00:37.760 --> 00:00:41.510 align:start position:0%\\nand it's also going to be great to work\\non<00:00:38.079> your<00:00:38.320> posture\\n\\n00:00:41.510 --> 00:00:41.520 align:start position:0%\\n \\n \\n\\n00:00:41.520 --> 00:00:45.840 align:start position:0%\\n \\nthat's<00:00:41.760> how<00:00:41.920> to<00:00:42.000> properly<00:00:42.399> perform<00:00:42.879> a<00:00:42.960> 
squat\\n\\n\", transcript=\"00:00:00\\nlet's learn how to properly perform a\\n00:00:01\\nsquat\\n00:00:02\\nstart with your feet slightly wider than\\n00:00:04\\nshoulder width apart\\n00:00:06\\ncross your arms in front\\n00:00:09\\nso touch your right hand to your left\\n00:00:11\\nshoulder and vice versa pointing your\\n00:00:13\\nelbows straight ahead now from here\\n00:00:15\\nshift your weight to the ball of your\\n00:00:17\\nfeet\\n00:00:18\\nand bend your knees\\n00:00:21\\nget as close to 90 degrees as you can\\n00:00:23\\nlooking straight ahead and from here\\n00:00:25\\npush back up to the starting position\\n00:00:29\\nthis is going to be great for\\n00:00:30\\nstrengthening\\n00:00:31\\nyour thighs or your quadriceps as well\\n00:00:34\\nas your butt or your glutes\\n00:00:36\\nand it's also going to be great to work\\n00:00:37\\non your posture\\n00:00:41\\nthat's how to properly perform a squat\"), VideoInfo(video_id='IB_icWRzi4E', url='https://www.youtube.com/watch?v=IB_icWRzi4E', relative_video_path='videos/IB_icWRzi4E.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:02.419 --> 00:00:05.150 align:start position:0%\\n \\nhi<00:00:03.419> I'm<00:00:03.810> Roger<00:00:03.990> Frampton<00:00:04.319> and<00:00:04.770> I'm<00:00:04.859> a<00:00:04.950> movement\\n\\n00:00:05.150 --> 00:00:05.160 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\n \\n\\n00:00:05.160 --> 00:00:07.400 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\ncoach<00:00:05.520> from<00:00:05.910> London<00:00:06.299> and<00:00:06.509> I'm<00:00:06.779> author<00:00:07.049> of<00:00:07.230> the\\n\\n00:00:07.400 --> 00:00:07.410 align:start position:0%\\ncoach from London and I'm author of the\\n \\n\\n00:00:07.410 --> 00:00:10.310 align:start position:0%\\ncoach from London and I'm author of the\\nbook<00:00:07.649> the<00:00:08.010> flexible<00:00:08.400> body<00:00:08.660> so<00:00:09.660> this<00:00:09.840> position\\n\\n00:00:10.310 --> 00:00:10.320 align:start position:0%\\nbook the flexible body so this position\\n \\n\\n00:00:10.320 --> 00:00:11.780 align:start position:0%\\nbook the flexible body so this position\\nis<00:00:10.860> the<00:00:11.099> squat\\n\\n00:00:11.780 --> 00:00:11.790 align:start position:0%\\nis the squat\\n \\n\\n00:00:11.790 --> 00:00:13.459 align:start position:0%\\nis the squat\\nmost<00:00:12.150> people<00:00:12.480> when<00:00:12.599> I<00:00:12.660> talk<00:00:12.870> about<00:00:12.990> the<00:00:13.200> squat\\n\\n00:00:13.459 --> 00:00:13.469 align:start position:0%\\nmost people when I talk about the squat\\n \\n\\n00:00:13.469 --> 00:00:14.780 align:start position:0%\\nmost people when I talk about the squat\\nthinking<00:00:13.799> when<00:00:13.920> I'm<00:00:14.009> forming<00:00:14.340> up<00:00:14.460> and<00:00:14.639> down\\n\\n00:00:14.780 --> 00:00:14.790 align:start position:0%\\nthinking when I'm forming up and down\\n \\n\\n00:00:14.790 --> 00:00:16.790 align:start position:0%\\nthinking when I'm forming up and down\\nexercise<00:00:15.299> it<00:00:15.509> works<00:00:15.750> your<00:00:15.960> legs<00:00:16.080> and<00:00:16.350> bar\\n\\n00:00:16.790 --> 00:00:16.800 align:start position:0%\\nexercise it works your legs and bar\\n \\n\\n00:00:16.800 --> 00:00:19.670 align:start position:0%\\nexercise it works your legs and bar\\nwell<00:00:17.640> the<00:00:17.789> squat<00:00:18.090> is<00:00:18.210> actually<00:00:18.570> a<00:00:18.680> position\\n\\n00:00:19.670 --> 00:00:19.680 align:start 
position:0%\\nwell the squat is actually a position\\n \\n\\n00:00:19.680 --> 00:00:22.609 align:start position:0%\\nwell the squat is actually a position\\nthat<00:00:20.520> we're<00:00:20.730> designed<00:00:21.029> to<00:00:21.180> defecate<00:00:21.660> in<00:00:21.689> every\\n\\n00:00:22.609 --> 00:00:22.619 align:start position:0%\\nthat we're designed to defecate in every\\n \\n\\n00:00:22.619 --> 00:00:25.810 align:start position:0%\\nthat we're designed to defecate in every\\nkid<00:00:22.830> sits<00:00:23.670> and<00:00:24.000> rests<00:00:24.600> in<00:00:24.900> this<00:00:25.140> position<00:00:25.560> and\\n\\n00:00:25.810 --> 00:00:25.820 align:start position:0%\\nkid sits and rests in this position and\\n \\n\\n00:00:25.820 --> 00:00:28.700 align:start position:0%\\nkid sits and rests in this position and\\nif<00:00:26.820> we<00:00:26.910> look<00:00:27.119> at<00:00:27.240> Western<00:00:27.480> people<00:00:28.050> I<00:00:28.230> think\\n\\n00:00:28.700 --> 00:00:28.710 align:start position:0%\\nif we look at Western people I think\\n \\n\\n00:00:28.710 --> 00:00:31.550 align:start position:0%\\nif we look at Western people I think\\nmost<00:00:28.890> people<00:00:29.400> end<00:00:30.029> up<00:00:30.240> sitting<00:00:30.510> up<00:00:31.199> on<00:00:31.410> their\\n\\n00:00:31.550 --> 00:00:31.560 align:start position:0%\\nmost people end up sitting up on their\\n \\n\\n00:00:31.560 --> 00:00:33.740 align:start position:0%\\nmost people end up sitting up on their\\ntoes<00:00:31.740> more<00:00:32.070> in<00:00:32.219> this<00:00:32.309> position<00:00:32.759> while<00:00:33.660> I'm\\n\\n00:00:33.740 --> 00:00:33.750 align:start position:0%\\ntoes more in this position while I'm\\n \\n\\n00:00:33.750 --> 00:00:35.510 align:start position:0%\\ntoes more in this position while I'm\\nadvising<00:00:34.290> is<00:00:34.410> that<00:00:34.530> you<00:00:34.649> get<00:00:34.890> this<00:00:35.070> position\\n\\n00:00:35.510 --> 00:00:35.520 align:start position:0%\\nadvising is that you get this position\\n \\n\\n00:00:35.520 --> 00:00:38.150 align:start position:0%\\nadvising is that you get this position\\nback<00:00:35.730> not<00:00:36.510> for<00:00:36.719> an<00:00:36.809> exercise<00:00:37.260> necessarily<00:00:38.010> not\\n\\n00:00:38.150 --> 00:00:38.160 align:start position:0%\\nback not for an exercise necessarily not\\n \\n\\n00:00:38.160 --> 00:00:40.610 align:start position:0%\\nback not for an exercise necessarily not\\nbecause<00:00:38.489> of<00:00:38.520> fitness<00:00:38.969> or<00:00:39.149> to<00:00:39.270> be<00:00:39.420> fitter<00:00:39.719> but\\n\\n00:00:40.610 --> 00:00:40.620 align:start position:0%\\nbecause of fitness or to be fitter but\\n \\n\\n00:00:40.620 --> 00:00:43.430 align:start position:0%\\nbecause of fitness or to be fitter but\\njust<00:00:40.920> because<00:00:41.520> you're<00:00:41.730> designed<00:00:42.120> to<00:00:43.110> do<00:00:43.260> it\\n\\n00:00:43.430 --> 00:00:43.440 align:start position:0%\\njust because you're designed to do it\\n \\n\\n00:00:43.440 --> 00:00:45.350 align:start position:0%\\njust because you're designed to do it\\nthis<00:00:43.710> here<00:00:43.980> is<00:00:44.219> just<00:00:44.399> the<00:00:44.610> human<00:00:44.789> resting\\n\\n00:00:45.350 --> 00:00:45.360 align:start position:0%\\nthis here is just the human resting\\n \\n\\n00:00:45.360 --> 00:00:48.229 align:start position:0%\\nthis here is just the human resting\\nposition<00:00:45.780> so<00:00:46.230> when<00:00:46.379> 
I<00:00:46.410> run<00:00:46.649> classes<00:00:47.250> people\\n\\n00:00:48.229 --> 00:00:48.239 align:start position:0%\\nposition so when I run classes people\\n \\n\\n00:00:48.239 --> 00:00:50.119 align:start position:0%\\nposition so when I run classes people\\nalways<00:00:48.480> talk<00:00:49.140> about<00:00:49.170> the<00:00:49.469> squat<00:00:49.770> how<00:00:49.980> they\\n\\n00:00:50.119 --> 00:00:50.129 align:start position:0%\\nalways talk about the squat how they\\n \\n\\n00:00:50.129 --> 00:00:51.590 align:start position:0%\\nalways talk about the squat how they\\ncan't<00:00:50.399> quite<00:00:50.610> get<00:00:50.789> their<00:00:50.969> heel<00:00:51.149> down<00:00:51.390> on<00:00:51.510> the\\n\\n00:00:51.590 --> 00:00:51.600 align:start position:0%\\ncan't quite get their heel down on the\\n \\n\\n00:00:51.600 --> 00:00:53.569 align:start position:0%\\ncan't quite get their heel down on the\\nfloor<00:00:51.840> now<00:00:52.649> the<00:00:52.739> reason<00:00:53.070> for<00:00:53.250> that<00:00:53.309> is<00:00:53.550> because\\n\\n00:00:53.569 --> 00:00:53.579 align:start position:0%\\nfloor now the reason for that is because\\n \\n\\n00:00:53.579 --> 00:00:55.220 align:start position:0%\\nfloor now the reason for that is because\\nsince<00:00:54.059> about<00:00:54.239> the<00:00:54.300> age<00:00:54.420> of<00:00:54.600> four<00:00:54.840> years<00:00:55.050> old\\n\\n00:00:55.220 --> 00:00:55.230 align:start position:0%\\nsince about the age of four years old\\n \\n\\n00:00:55.230 --> 00:00:58.099 align:start position:0%\\nsince about the age of four years old\\nyou've<00:00:55.800> been<00:00:56.010> wearing<00:00:56.219> shoes<00:00:56.809> we<00:00:57.809> are<00:00:57.960> an\\n\\n00:00:58.099 --> 00:00:58.109 align:start position:0%\\nyou've been wearing shoes we are an\\n \\n\\n00:00:58.109 --> 00:00:58.549 align:start position:0%\\nyou've been wearing shoes we are an\\nanimal\\n\\n00:00:58.549 --> 00:00:58.559 align:start position:0%\\nanimal\\n \\n\\n00:00:58.559 --> 00:01:00.229 align:start position:0%\\nanimal\\nwe're<00:00:59.070> designed<00:00:59.370> to<00:00:59.460> walk<00:00:59.640> around<00:00:59.789> on<00:01:00.059> bare\\n\\n00:01:00.229 --> 00:01:00.239 align:start position:0%\\nwe're designed to walk around on bare\\n \\n\\n00:01:00.239 --> 00:01:02.510 align:start position:0%\\nwe're designed to walk around on bare\\nfeet<00:01:00.270> and<00:01:00.690> the<00:01:01.109> reason<00:01:01.469> that<00:01:01.559> we'll<00:01:01.710> lose<00:01:01.949> our\\n\\n00:01:02.510 --> 00:01:02.520 align:start position:0%\\nfeet and the reason that we'll lose our\\n \\n\\n00:01:02.520 --> 00:01:05.210 align:start position:0%\\nfeet and the reason that we'll lose our\\nsquat<00:01:03.090> mobility<00:01:03.570> is<00:01:04.229> simply<00:01:04.619> because<00:01:04.920> we<00:01:05.040> come\\n\\n00:01:05.210 --> 00:01:05.220 align:start position:0%\\nsquat mobility is simply because we come\\n \\n\\n00:01:05.220 --> 00:01:07.370 align:start position:0%\\nsquat mobility is simply because we come\\nto<00:01:05.369> become<00:01:05.610> tight<00:01:05.970> in<00:01:06.150> our<00:01:06.210> ankles<00:01:06.659> and\\n\\n00:01:07.370 --> 00:01:07.380 align:start position:0%\\nto become tight in our ankles and\\n \\n\\n00:01:07.380 --> 00:01:09.590 align:start position:0%\\nto become tight in our ankles and\\ntherefore<00:01:08.070> you<00:01:08.670> can't<00:01:08.970> get<00:01:09.090> all<00:01:09.210> the<00:01:09.270> way<00:01:09.360> 
down\\n\\n00:01:09.590 --> 00:01:09.600 align:start position:0%\\ntherefore you can't get all the way down\\n \\n\\n00:01:09.600 --> 00:01:11.990 align:start position:0%\\ntherefore you can't get all the way down\\nto<00:01:09.780> the<00:01:09.869> bottom<00:01:10.010> now<00:01:11.010> all<00:01:11.369> you<00:01:11.490> need<00:01:11.640> to<00:01:11.729> do<00:01:11.880> is\\n\\n00:01:11.990 --> 00:01:12.000 align:start position:0%\\nto the bottom now all you need to do is\\n \\n\\n00:01:12.000 --> 00:01:14.030 align:start position:0%\\nto the bottom now all you need to do is\\nif<00:01:12.240> you<00:01:12.390> put<00:01:12.570> your<00:01:12.600> heels<00:01:12.990> on<00:01:13.200> a<00:01:13.229> yoga<00:01:13.590> block<00:01:13.799> or\\n\\n00:01:14.030 --> 00:01:14.040 align:start position:0%\\nif you put your heels on a yoga block or\\n \\n\\n00:01:14.040 --> 00:01:16.580 align:start position:0%\\nif you put your heels on a yoga block or\\na<00:01:14.100> book<00:01:14.400> you're<00:01:15.000> in<00:01:15.119> that<00:01:15.270> position<00:01:15.590> rather\\n\\n00:01:16.580 --> 00:01:16.590 align:start position:0%\\na book you're in that position rather\\n \\n\\n00:01:16.590 --> 00:01:19.070 align:start position:0%\\na book you're in that position rather\\nthan<00:01:16.799> that<00:01:16.890> position<00:01:17.430> that<00:01:18.330> will<00:01:18.600> take<00:01:18.900> away\\n\\n00:01:19.070 --> 00:01:19.080 align:start position:0%\\nthan that position that will take away\\n \\n\\n00:01:19.080 --> 00:01:21.050 align:start position:0%\\nthan that position that will take away\\nyour<00:01:19.320> ankle<00:01:19.680> mobility<00:01:19.770> and<00:01:20.280> allow<00:01:20.549> you<00:01:20.610> to<00:01:20.909> get\\n\\n00:01:21.050 --> 00:01:21.060 align:start position:0%\\nyour ankle mobility and allow you to get\\n \\n\\n00:01:21.060 --> 00:01:23.149 align:start position:0%\\nyour ankle mobility and allow you to get\\nall<00:01:21.210> the<00:01:21.330> way<00:01:21.450> down<00:01:21.689> now<00:01:22.500> you<00:01:22.560> can<00:01:22.770> use<00:01:22.920> a<00:01:22.950> yoga\\n\\n00:01:23.149 --> 00:01:23.159 align:start position:0%\\nall the way down now you can use a yoga\\n \\n\\n00:01:23.159 --> 00:01:24.950 align:start position:0%\\nall the way down now you can use a yoga\\nblock<00:01:23.490> or<00:01:23.640> a<00:01:23.670> book<00:01:23.880> or<00:01:24.119> two<00:01:24.299> or<00:01:24.450> three<00:01:24.479> of<00:01:24.720> them\\n\\n00:01:24.950 --> 00:01:24.960 align:start position:0%\\nblock or a book or two or three of them\\n \\n\\n00:01:24.960 --> 00:01:27.590 align:start position:0%\\nblock or a book or two or three of them\\nif<00:01:25.830> you're<00:01:26.009> a<00:01:26.070> girl<00:01:26.400> squatting<00:01:26.939> in<00:01:27.060> heels<00:01:27.299> it's\\n\\n00:01:27.590 --> 00:01:27.600 align:start position:0%\\nif you're a girl squatting in heels it's\\n \\n\\n00:01:27.600 --> 00:01:29.600 align:start position:0%\\nif you're a girl squatting in heels it's\\nso<00:01:27.869> much<00:01:28.049> easier<00:01:28.080> and<00:01:28.650> squatting<00:01:29.189> flat<00:01:29.460> on<00:01:29.549> the\\n\\n00:01:29.600 --> 00:01:29.610 align:start position:0%\\nso much easier and squatting flat on the\\n \\n\\n00:01:29.610 --> 00:01:31.490 align:start position:0%\\nso much easier and squatting flat on the\\nfloor<00:01:29.820> so<00:01:30.479> what<00:01:30.600> you<00:01:30.720> do<00:01:30.840> is<00:01:30.960> you<00:01:31.079> 
start<00:01:31.320> on\\n\\n00:01:31.490 --> 00:01:31.500 align:start position:0%\\nfloor so what you do is you start on\\n \\n\\n00:01:31.500 --> 00:01:33.319 align:start position:0%\\nfloor so what you do is you start on\\nwhatever<00:01:31.710> angle<00:01:32.220> you're<00:01:32.400> comfortable<00:01:32.970> in<00:01:33.090> and\\n\\n00:01:33.319 --> 00:01:33.329 align:start position:0%\\nwhatever angle you're comfortable in and\\n \\n\\n00:01:33.329 --> 00:01:36.260 align:start position:0%\\nwhatever angle you're comfortable in and\\nyou<00:01:34.110> work<00:01:34.350> your<00:01:34.560> way<00:01:34.710> down<00:01:34.740> to<00:01:35.549> becoming<00:01:36.000> flat\\n\\n00:01:36.260 --> 00:01:36.270 align:start position:0%\\nyou work your way down to becoming flat\\n \\n\\n00:01:36.270 --> 00:01:39.679 align:start position:0%\\nyou work your way down to becoming flat\\non<00:01:36.360> the<00:01:36.450> floor<00:01:36.659> so<00:01:37.409> really<00:01:37.740> a<00:01:37.770> squat<00:01:38.490> is<00:01:38.939> just<00:01:39.509> a\\n\\n00:01:39.679 --> 00:01:39.689 align:start position:0%\\non the floor so really a squat is just a\\n \\n\\n00:01:39.689 --> 00:01:42.469 align:start position:0%\\non the floor so really a squat is just a\\ntest<00:01:40.079> of<00:01:40.350> your<00:01:41.070> ankles<00:01:41.549> if<00:01:41.880> you've<00:01:42.090> got<00:01:42.299> good\\n\\n00:01:42.469 --> 00:01:42.479 align:start position:0%\\ntest of your ankles if you've got good\\n \\n\\n00:01:42.479 --> 00:01:45.499 align:start position:0%\\ntest of your ankles if you've got good\\nankles<00:01:42.869> you're<00:01:43.590> great<00:01:43.890> at<00:01:44.070> squatting<00:01:44.549> and<00:01:44.759> if\\n\\n00:01:45.499 --> 00:01:45.509 align:start position:0%\\nankles you're great at squatting and if\\n \\n\\n00:01:45.509 --> 00:01:47.420 align:start position:0%\\nankles you're great at squatting and if\\nyour<00:01:45.659> ankles<00:01:46.020> have<00:01:46.110> become<00:01:46.409> tight<00:01:46.680> then<00:01:47.009> squat\\n\\n00:01:47.420 --> 00:01:47.430 align:start position:0%\\nyour ankles have become tight then squat\\n \\n\\n00:01:47.430 --> 00:01:49.609 align:start position:0%\\nyour ankles have become tight then squat\\nis<00:01:47.549> really<00:01:47.820> difficult<00:01:48.060> and<00:01:48.450> hard<00:01:48.689> but<00:01:49.350> like\\n\\n00:01:49.609 --> 00:01:49.619 align:start position:0%\\nis really difficult and hard but like\\n \\n\\n00:01:49.619 --> 00:01:51.499 align:start position:0%\\nis really difficult and hard but like\\nevery<00:01:50.070> other<00:01:50.220> muscle<00:01:50.490> and<00:01:50.880> joint<00:01:51.180> in<00:01:51.390> the<00:01:51.479> body\\n\\n00:01:51.499 --> 00:01:51.509 align:start position:0%\\nevery other muscle and joint in the body\\n \\n\\n00:01:51.509 --> 00:01:54.080 align:start position:0%\\nevery other muscle and joint in the body\\nis<00:01:51.960> trainable<00:01:52.680> and<00:01:52.920> you've<00:01:53.399> got<00:01:53.549> time<00:01:53.820> to<00:01:53.970> get\\n\\n00:01:54.080 --> 00:01:54.090 align:start position:0%\\nis trainable and you've got time to get\\n \\n\\n00:01:54.090 --> 00:01:56.870 align:start position:0%\\nis trainable and you've got time to get\\nit<00:01:54.210> back<00:01:54.710> if<00:01:55.710> you<00:01:55.770> do<00:01:56.070> something<00:01:56.369> for<00:01:56.549> 10\\n\\n00:01:56.870 --> 00:01:56.880 align:start position:0%\\nit back if you do something for 10\\n \\n\\n00:01:56.880 --> 
00:01:58.850 align:start position:0%\\nit back if you do something for 10\\nminutes<00:01:57.149> every<00:01:57.360> day<00:01:57.540> kind<00:01:58.259> of<00:01:58.350> like<00:01:58.560> brushing\\n\\n00:01:58.850 --> 00:01:58.860 align:start position:0%\\nminutes every day kind of like brushing\\n \\n\\n00:01:58.860 --> 00:02:01.160 align:start position:0%\\nminutes every day kind of like brushing\\nyour<00:01:59.070> teeth<00:01:59.280> it<00:02:00.000> gets<00:02:00.240> into<00:02:00.509> a<00:02:00.540> habit<00:02:00.930> and<00:02:01.110> you\\n\\n00:02:01.160 --> 00:02:01.170 align:start position:0%\\nyour teeth it gets into a habit and you\\n \\n\\n00:02:01.170 --> 00:02:02.719 align:start position:0%\\nyour teeth it gets into a habit and you\\ngo<00:02:01.320> you<00:02:01.380> get<00:02:01.619> up<00:02:01.740> you<00:02:01.890> brush<00:02:02.070> your<00:02:02.219> teeth<00:02:02.369> go<00:02:02.670> to\\n\\n00:02:02.719 --> 00:02:02.729 align:start position:0%\\ngo you get up you brush your teeth go to\\n \\n\\n00:02:02.729 --> 00:02:05.270 align:start position:0%\\ngo you get up you brush your teeth go to\\nveggie<00:02:03.000> brush<00:02:03.240> your<00:02:03.390> teeth<00:02:03.540> so<00:02:04.290> by<00:02:04.740> doing<00:02:04.799> 10\\n\\n00:02:05.270 --> 00:02:05.280 align:start position:0%\\nveggie brush your teeth so by doing 10\\n \\n\\n00:02:05.280 --> 00:02:06.080 align:start position:0%\\nveggie brush your teeth so by doing 10\\nminutes<00:02:05.520> every<00:02:05.820> day\\n\\n00:02:06.080 --> 00:02:06.090 align:start position:0%\\nminutes every day\\n \\n\\n00:02:06.090 --> 00:02:08.690 align:start position:0%\\nminutes every day\\nyou'll<00:02:06.810> get<00:02:06.990> into<00:02:07.109> the<00:02:07.259> habit<00:02:07.469> by<00:02:08.399> playing\\n\\n00:02:08.690 --> 00:02:08.700 align:start position:0%\\nyou'll get into the habit by playing\\n \\n\\n00:02:08.700 --> 00:02:10.759 align:start position:0%\\nyou'll get into the habit by playing\\nusing<00:02:09.420> just<00:02:09.750> that<00:02:09.899> short<00:02:10.140> of<00:02:10.289> a<00:02:10.379> short<00:02:10.590> amount\\n\\n00:02:10.759 --> 00:02:10.769 align:start position:0%\\nusing just that short of a short amount\\n \\n\\n00:02:10.769 --> 00:02:12.559 align:start position:0%\\nusing just that short of a short amount\\nof<00:02:10.920> time<00:02:11.129> you<00:02:11.310> can<00:02:11.340> really<00:02:11.700> focus<00:02:11.910> on<00:02:12.239> how<00:02:12.390> your\\n\\n00:02:12.559 --> 00:02:12.569 align:start position:0%\\nof time you can really focus on how your\\n \\n\\n00:02:12.569 --> 00:02:13.520 align:start position:0%\\nof time you can really focus on how your\\nbody's<00:02:12.870> anemic\\n\\n00:02:13.520 --> 00:02:13.530 align:start position:0%\\nbody's anemic\\n \\n\\n00:02:13.530 --> 00:02:15.050 align:start position:0%\\nbody's anemic\\nnow<00:02:14.010> some<00:02:14.220> of<00:02:14.280> the<00:02:14.340> exercises<00:02:14.790> that<00:02:14.940> I\\n\\n00:02:15.050 --> 00:02:15.060 align:start position:0%\\nnow some of the exercises that I\\n \\n\\n00:02:15.060 --> 00:02:16.699 align:start position:0%\\nnow some of the exercises that I\\nrecommend<00:02:15.540> that<00:02:15.599> you<00:02:15.780> do<00:02:15.989> within<00:02:16.290> those<00:02:16.440> 10\\n\\n00:02:16.699 --> 00:02:16.709 align:start position:0%\\nrecommend that you do within those 10\\n \\n\\n00:02:16.709 --> 00:02:20.270 align:start position:0%\\nrecommend that you do within those 
10\\nminutes<00:02:16.860> first<00:02:17.849> one<00:02:18.120> is<00:02:18.330> the<00:02:18.720> squat<00:02:19.280> sitting\\n\\n00:02:20.270 --> 00:02:20.280 align:start position:0%\\nminutes first one is the squat sitting\\n \\n\\n00:02:20.280 --> 00:02:22.759 align:start position:0%\\nminutes first one is the squat sitting\\nin<00:02:20.489> a<00:02:20.580> squat<00:02:20.940> not<00:02:21.480> bobbing<00:02:22.019> up<00:02:22.080> and<00:02:22.230> down<00:02:22.349> just\\n\\n00:02:22.759 --> 00:02:22.769 align:start position:0%\\nin a squat not bobbing up and down just\\n \\n\\n00:02:22.769 --> 00:02:24.589 align:start position:0%\\nin a squat not bobbing up and down just\\nbeing<00:02:23.010> able<00:02:23.190> to<00:02:23.370> function<00:02:24.000> to<00:02:24.209> sit<00:02:24.420> in<00:02:24.540> the\\n\\n00:02:24.589 --> 00:02:24.599 align:start position:0%\\nbeing able to function to sit in the\\n \\n\\n00:02:24.599 --> 00:02:27.229 align:start position:0%\\nbeing able to function to sit in the\\nsquat<00:02:24.930> something<00:02:25.709> again<00:02:26.010> is<00:02:26.250> innate<00:02:26.400> to<00:02:26.819> us<00:02:26.849> as\\n\\n00:02:27.229 --> 00:02:27.239 align:start position:0%\\nsquat something again is innate to us as\\n \\n\\n00:02:27.239 --> 00:02:28.729 align:start position:0%\\nsquat something again is innate to us as\\nhuman<00:02:27.569> beings<00:02:27.599> and<00:02:28.049> something<00:02:28.440> that<00:02:28.530> you<00:02:28.650> are\\n\\n00:02:28.729 --> 00:02:28.739 align:start position:0%\\nhuman beings and something that you are\\n \\n\\n00:02:28.739 --> 00:02:31.670 align:start position:0%\\nhuman beings and something that you are\\nable<00:02:28.890> to<00:02:29.220> do<00:02:29.910> as<00:02:30.090> a<00:02:30.120> child<00:02:30.209> so<00:02:30.870> spend<00:02:31.440> 10\\n\\n00:02:31.670 --> 00:02:31.680 align:start position:0%\\nable to do as a child so spend 10\\n \\n\\n00:02:31.680 --> 00:02:34.640 align:start position:0%\\nable to do as a child so spend 10\\nminutes<00:02:32.239> every<00:02:33.239> day<00:02:33.450> getting<00:02:34.110> your<00:02:34.410> squat\\n\\n00:02:34.640 --> 00:02:34.650 align:start position:0%\\nminutes every day getting your squat\\n \\n\\n00:02:34.650 --> 00:02:37.360 align:start position:0%\\nminutes every day getting your squat\\nback\\n\\n00:02:37.360 --> 00:02:37.370 align:start position:0%\\n \\n \\n\\n00:02:37.370 --> 00:02:51.610 align:start position:0%\\n \\n[Music]\\n\\n00:02:51.610 --> 00:02:51.620 align:start position:0%\\n \\n \\n\\n00:02:51.620 --> 00:02:56.310 align:start position:0%\\n \\n[Music]\\n\\n\", transcript=\"00:00:02\\nhi I'm Roger Frampton and I'm a movement\\n00:00:05\\ncoach from London and I'm author of the\\n00:00:07\\nbook the flexible body so this position\\n00:00:10\\nis the squat\\n00:00:11\\nmost people when I talk about the squat\\n00:00:13\\nthinking when I'm forming up and down\\n00:00:14\\nexercise it works your legs and bar\\n00:00:16\\nwell the squat is actually a position\\n00:00:19\\nthat we're designed to defecate in every\\n00:00:22\\nkid sits and rests in this position and\\n00:00:25\\nif we look at Western people I think\\n00:00:28\\nmost people end up sitting up on their\\n00:00:31\\ntoes more in this position while I'm\\n00:00:33\\nadvising is that you get this position\\n00:00:35\\nback not for an exercise necessarily not\\n00:00:38\\nbecause of fitness or to be fitter but\\n00:00:40\\njust because you're designed to do it\\n00:00:43\\nthis here is 
just the human resting\\n00:00:45\\nposition so when I run classes people\\n00:00:48\\nalways talk about the squat how they\\n00:00:50\\ncan't quite get their heel down on the\\n00:00:51\\nfloor now the reason for that is because\\n00:00:53\\nsince about the age of four years old\\n00:00:55\\nyou've been wearing shoes we are an\\n00:00:58\\nanimal\\nwe're designed to walk around on bare\\n00:01:00\\nfeet and the reason that we'll lose our\\n00:01:02\\nsquat mobility is simply because we come\\n00:01:05\\nto become tight in our ankles and\\n00:01:07\\ntherefore you can't get all the way down\\n00:01:09\\nto the bottom now all you need to do is\\n00:01:11\\n00:01:12\\nif you put your heels on a yoga block or\\n00:01:14\\na book you're in that position rather\\n00:01:16\\nthan that position that will take away\\n00:01:19\\nyour ankle mobility and allow you to get\\n00:01:21\\nall the way down now you can use a yoga\\n00:01:23\\nblock or a book or two or three of them\\n00:01:24\\nif you're a girl squatting in heels it's\\n00:01:27\\nso much easier and squatting flat on the\\n00:01:29\\nfloor so what you do is you start on\\n00:01:31\\nwhatever angle you're comfortable in and\\n00:01:33\\nyou work your way down to becoming flat\\n00:01:36\\non the floor so really a squat is just a\\n00:01:39\\ntest of your ankles if you've got good\\n00:01:42\\nankles you're great at squatting and if\\n00:01:45\\nyour ankles have become tight then squat\\n00:01:47\\nis really difficult and hard but like\\n00:01:49\\nevery other muscle and joint in the body\\n00:01:51\\nis trainable and you've got time to get\\n00:01:54\\nit back if you do something for 10\\n00:01:56\\nminutes every day kind of like brushing\\n00:01:58\\nyour teeth it gets into a habit and you\\n00:02:01\\ngo you get up you brush your teeth go to\\n00:02:02\\nveggie brush your teeth so by doing 10\\n00:02:05\\nminutes every day\\n00:02:06\\nyou'll get into the habit by playing\\n00:02:08\\nusing just that short of a short amount\\n00:02:10\\nof time you can really focus on how your\\n00:02:12\\nbody's anemic\\n00:02:13\\nnow some of the exercises that I\\n00:02:15\\nrecommend that you do within those 10\\n00:02:16\\nminutes first one is the squat sitting\\n00:02:20\\nin a squat not bobbing up and down just\\n00:02:22\\nbeing able to function to sit in the\\n00:02:24\\nsquat something again is innate to us as\\n00:02:27\\nhuman beings and something that you are\\n00:02:28\\nable to do as a child so spend 10\\n00:02:31\\nminutes every day getting your squat\\n00:02:34\\nback\\n00:02:37\\n[Music]\\n00:02:51\")]}}\n", + "Frame idx 0\n", + "Frame idx 24\n", + "Frame idx 48\n", + "Frame idx 72\n", + "Frame idx 96\n", + "Frame idx 120\n", + "Frame idx 144\n", + "Frame idx 168\n", + "Frame idx 192\n", + "Frame idx 216\n", + "Frame idx 240\n", + "Frame idx 264\n", + "Frame idx 288\n", + "Frame idx 312\n", + "Frame idx 336\n", + "Frame idx 360\n", + "Frame idx 384\n", + "Frame idx 408\n", + "Frame idx 432\n", + "Frame idx 456\n", + "Frame idx 480\n", + "Frame idx 504\n", + "Frame idx 528\n", + "Frame idx 552\n", + "Frame idx 576\n", + "Frame idx 600\n", + "Init model complete\n", + "Frame idx 624\n", + "Frame idx 648\n", + "Frame idx 672\n", + "Frame idx 696\n", + "Frame idx 720\n", + "Frame idx 744\n", + "Frame idx 768\n", + "Frame idx 792\n", + "Frame idx 816\n", + "Frame idx 840\n", + "Frame idx 864\n", + "Frame idx 888\n", + "Frame idx 912\n", + "Frame idx 936\n", + "Frame idx 960\n", + "Frame idx 984\n", + "Frame idx 1008\n", + "Frame idx 1032\n", + 
"Frame idx 1056\n", + "Frame idx 1080\n", + "Frame idx 1104\n", + "Frame idx 1128\n", + "Frame idx 1152\n", + "Frame idx 0\n", + "Frame idx 25\n", + "Frame idx 50\n", + "Frame idx 75\n", + "Frame idx 100\n", + "Frame idx 125\n", + "Frame idx 150\n", + "Frame idx 175\n", + "Frame idx 200\n", + "Frame idx 225\n", + "Frame idx 250\n", + "Frame idx 275\n", + "Frame idx 300\n", + "Frame idx 325\n", + "Frame idx 350\n", + "Frame idx 375\n", + "Frame idx 400\n", + "Frame idx 425\n", + "Frame idx 450\n", + "Frame idx 475\n", + "Frame idx 500\n", + "Frame idx 525\n", + "Frame idx 550\n", + "Frame idx 575\n", + "Frame idx 600\n", + "Frame idx 625\n", + "Frame idx 650\n", + "Frame idx 675\n", + "Frame idx 700\n", + "Frame idx 725\n", + "Frame idx 750\n", + "Frame idx 775\n", + "Frame idx 800\n", + "Frame idx 825\n", + "Frame idx 850\n", + "Frame idx 875\n", + "Frame idx 900\n", + "Frame idx 925\n", + "Frame idx 950\n", + "Frame idx 975\n", + "Frame idx 1000\n", + "Frame idx 1025\n", + "Frame idx 1050\n", + "Frame idx 1075\n", + "Frame idx 1100\n", + "Frame idx 1125\n", + "Frame idx 1150\n", + "Frame idx 1175\n", + "Frame idx 1200\n", + "Frame idx 1225\n", + "Frame idx 1250\n", + "Frame idx 1275\n", + "Frame idx 1300\n", + "Frame idx 1325\n", + "Frame idx 1350\n", + "Frame idx 1375\n", + "Frame idx 1400\n", + "Frame idx 1425\n", + "Frame idx 1450\n", + "Frame idx 1475\n", + "Frame idx 1500\n", + "Frame idx 1525\n", + "Frame idx 1550\n", + "Frame idx 1575\n", + "Frame idx 1600\n", + "Frame idx 1625\n", + "Frame idx 1650\n", + "Frame idx 1675\n", + "Frame idx 1700\n", + "Frame idx 1725\n", + "Frame idx 1750\n", + "Frame idx 1775\n", + "Frame idx 1800\n", + "Frame idx 1825\n", + "Frame idx 1850\n", + "Frame idx 1875\n", + "Frame idx 1900\n", + "Frame idx 1925\n", + "Frame idx 1950\n", + "Frame idx 1975\n", + "Frame idx 2000\n", + "Frame idx 2025\n", + "Frame idx 2050\n", + "Frame idx 2075\n", + "Frame idx 2100\n", + "Frame idx 2125\n", + "Frame idx 2150\n", + "Frame idx 2175\n", + "Frame idx 2200\n", + "Frame idx 2225\n", + "Frame idx 2250\n", + "Frame idx 2275\n", + "Frame idx 2300\n", + "Frame idx 2325\n", + "Frame idx 2350\n", + "Frame idx 2375\n", + "Frame idx 2400\n", + "Frame idx 2425\n", + "Frame idx 2450\n", + "Frame idx 2475\n", + "Frame idx 2500\n", + "Frame idx 2525\n", + "Frame idx 2550\n", + "Frame idx 2575\n", + "Frame idx 2600\n", + "Frame idx 2625\n", + "Frame idx 2650\n", + "Frame idx 2675\n", + "Frame idx 2700\n", + "Frame idx 2725\n", + "Frame idx 2750\n", + "Frame idx 2775\n", + "Frame idx 2800\n", + "Frame idx 2825\n", + "Frame idx 2850\n", + "Frame idx 2875\n", + "Frame idx 2900\n", + "Frame idx 2925\n", + "Frame idx 2950\n", + "Frame idx 2975\n", + "Frame idx 3000\n", + "Frame idx 3025\n", + "Frame idx 3050\n", + "Frame idx 3075\n", + "Frame idx 3100\n", + "Frame idx 3125\n", + "Frame idx 3150\n", + "Frame idx 3175\n", + "Frame idx 3200\n", + "Frame idx 3225\n", + "Frame idx 3250\n", + "Frame idx 3275\n", + "Frame idx 3300\n", + "Frame idx 3325\n", + "Frame idx 3350\n", + "Frame idx 3375\n", + "Frame idx 3400\n", + "Frame idx 3425\n", + "Frame idx 3450\n", + "Frame idx 3475\n", + "Frame idx 3500\n", + "Frame idx 3525\n", + "Frame idx 3550\n", + "Frame idx 3575\n", + "Frame idx 3600\n", + "Frame idx 3625\n", + "Frame idx 3650\n", + "Frame idx 3675\n", + "Frame idx 3700\n", + "Frame idx 3725\n", + "Frame idx 3750\n", + "Frame idx 3775\n", + "Frame idx 3800\n", + "Frame idx 3825\n", + "Frame idx 3850\n", + "Frame idx 3875\n", + "Frame idx 3900\n", + "Frame idx 3925\n", + "Frame idx 
3950\n", + "Frame idx 3975\n", + "Frame idx 4000\n", + "Frame idx 4025\n", + "Frame idx 4050\n", + "Frame idx 4075\n", + "Frame idx 4100\n", + "Frame idx 4125\n", + "Frame idx 4150\n", + "Frame idx 4175\n", + "Frame idx 4200\n", + "Frame idx 4225\n", + "Frame idx 4250\n", + "Frame idx 4275\n", + "Frame idx 4300\n", + "Frame idx 4325\n", + "Frame idx 4350\n", + "Frame idx 4375\n", + "All frames processed\n", + "Dataframe created\n", + " video_id frame_idx probs\n", + "0 xqvCmoLULNY 0 2.199925e-08\n", + "1 xqvCmoLULNY 24 1.503990e-01\n", + "2 xqvCmoLULNY 48 1.242190e-01\n", + "3 xqvCmoLULNY 72 1.302760e-01\n", + "4 xqvCmoLULNY 96 1.310861e-01\n", + ".. ... ... ...\n", + "220 IB_icWRzi4E 4275 2.498681e-07\n", + "221 IB_icWRzi4E 4300 3.288528e-07\n", + "222 IB_icWRzi4E 4325 3.445720e-07\n", + "223 IB_icWRzi4E 4350 3.333991e-07\n", + "224 IB_icWRzi4E 4375 2.660451e-07\n", + "\n", + "[225 rows x 3 columns]\n", + "Segments for video IB_icWRzi4E: [(0, 5), (9, 24), (29, 45), (49, 53), (62, 66), (103, 109), (138, 147)]\n", + "Segments for video xqvCmoLULNY: [(1, 44)]\n", + "{'detect_segments': {'segment_infos': [SegmentInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')]}}\n" ] } ], @@ -721,12 +1007,60 @@ "for s in graph.stream(\n", " {\n", " \"task\": \"i wanna teach people how to do squats\",\n", + " \"clip_text_prompts\": [\"person doing squats\"],\n", " },\n", " thread,\n", "):\n", " print(s)" ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'task': 'i wanna teach people how to do squats',\n", + " 'search_queries': ['how to do squats', 'squat exercise tutorial'],\n", + " 'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E'],\n", + " 'video_infos': [VideoInfo(video_id='xqvCmoLULNY', url='https://www.youtube.com/watch?v=xqvCmoLULNY', relative_video_path='videos/xqvCmoLULNY.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:00.160 --> 00:00:01.829 align:start position:0%\\n \\nlet's<00:00:00.399> learn<00:00:00.560> how<00:00:00.719> to<00:00:00.880> properly<00:00:01.280> perform<00:00:01.760> a\\n\\n00:00:01.829 --> 00:00:01.839 align:start position:0%\\nlet's learn how to properly perform a\\n \\n\\n00:00:01.839 --> 00:00:02.790 align:start position:0%\\nlet's learn how to properly perform a\\nsquat\\n\\n00:00:02.790 --> 00:00:02.800 align:start position:0%\\nsquat\\n \\n\\n00:00:02.800 --> 00:00:04.470 align:start position:0%\\nsquat\\nstart<00:00:03.120> with<00:00:03.199> your<00:00:03.360> feet<00:00:03.679> slightly<00:00:04.080> wider<00:00:04.319> than\\n\\n00:00:04.470 --> 00:00:04.480 align:start position:0%\\nstart with your feet slightly wider than\\n \\n\\n00:00:04.480 --> 00:00:06.389 align:start 
position:0%\\nstart with your feet slightly wider than\\nshoulder<00:00:04.799> width<00:00:05.120> apart\\n\\n00:00:06.389 --> 00:00:06.399 align:start position:0%\\nshoulder width apart\\n \\n\\n00:00:06.399 --> 00:00:09.190 align:start position:0%\\nshoulder width apart\\ncross<00:00:06.799> your<00:00:07.040> arms<00:00:07.440> in<00:00:07.759> front\\n\\n00:00:09.190 --> 00:00:09.200 align:start position:0%\\ncross your arms in front\\n \\n\\n00:00:09.200 --> 00:00:11.270 align:start position:0%\\ncross your arms in front\\nso<00:00:09.440> touch<00:00:09.679> your<00:00:09.920> right<00:00:10.240> hand<00:00:10.559> to<00:00:10.719> your<00:00:10.960> left\\n\\n00:00:11.270 --> 00:00:11.280 align:start position:0%\\nso touch your right hand to your left\\n \\n\\n00:00:11.280 --> 00:00:13.350 align:start position:0%\\nso touch your right hand to your left\\nshoulder<00:00:12.080> and<00:00:12.320> vice<00:00:12.559> versa<00:00:13.040> pointing<00:00:13.280> your\\n\\n00:00:13.350 --> 00:00:13.360 align:start position:0%\\nshoulder and vice versa pointing your\\n \\n\\n00:00:13.360 --> 00:00:15.190 align:start position:0%\\nshoulder and vice versa pointing your\\nelbows<00:00:13.679> straight<00:00:13.920> ahead<00:00:14.559> now<00:00:14.719> from<00:00:14.960> here\\n\\n00:00:15.190 --> 00:00:15.200 align:start position:0%\\nelbows straight ahead now from here\\n \\n\\n00:00:15.200 --> 00:00:17.109 align:start position:0%\\nelbows straight ahead now from here\\nshift<00:00:15.440> your<00:00:15.679> weight<00:00:16.160> to<00:00:16.320> the<00:00:16.480> ball<00:00:16.720> of<00:00:16.880> your\\n\\n00:00:17.109 --> 00:00:17.119 align:start position:0%\\nshift your weight to the ball of your\\n \\n\\n00:00:17.119 --> 00:00:18.230 align:start position:0%\\nshift your weight to the ball of your\\nfeet\\n\\n00:00:18.230 --> 00:00:18.240 align:start position:0%\\nfeet\\n \\n\\n00:00:18.240 --> 00:00:21.109 align:start position:0%\\nfeet\\nand<00:00:18.400> bend<00:00:18.720> your<00:00:18.880> knees\\n\\n00:00:21.109 --> 00:00:21.119 align:start position:0%\\nand bend your knees\\n \\n\\n00:00:21.119 --> 00:00:23.189 align:start position:0%\\nand bend your knees\\nget<00:00:21.359> as<00:00:21.520> close<00:00:21.680> to<00:00:21.840> 90<00:00:22.160> degrees<00:00:22.480> as<00:00:22.640> you<00:00:22.800> can\\n\\n00:00:23.189 --> 00:00:23.199 align:start position:0%\\nget as close to 90 degrees as you can\\n \\n\\n00:00:23.199 --> 00:00:25.830 align:start position:0%\\nget as close to 90 degrees as you can\\nlooking<00:00:23.519> straight<00:00:23.840> ahead<00:00:24.480> and<00:00:24.720> from<00:00:24.960> here\\n\\n00:00:25.830 --> 00:00:25.840 align:start position:0%\\nlooking straight ahead and from here\\n \\n\\n00:00:25.840 --> 00:00:29.109 align:start position:0%\\nlooking straight ahead and from here\\npush<00:00:26.160> back<00:00:26.400> up<00:00:26.480> to<00:00:26.640> the<00:00:26.720> starting<00:00:27.039> position\\n\\n00:00:29.109 --> 00:00:29.119 align:start position:0%\\npush back up to the starting position\\n \\n\\n00:00:29.119 --> 00:00:30.150 align:start position:0%\\npush back up to the starting position\\nthis<00:00:29.359> is<00:00:29.439> going<00:00:29.599> to<00:00:29.679> be<00:00:29.760> great<00:00:30.000> for\\n\\n00:00:30.150 --> 00:00:30.160 align:start position:0%\\nthis is going to be great for\\n \\n\\n00:00:30.160 --> 00:00:31.830 align:start position:0%\\nthis is going to be great for\\nstrengthening\\n\\n00:00:31.830 --> 00:00:31.840 
align:start position:0%\\nstrengthening\\n \\n\\n00:00:31.840 --> 00:00:34.069 align:start position:0%\\nstrengthening\\nyour<00:00:32.079> thighs<00:00:32.480> or<00:00:32.559> your<00:00:32.719> quadriceps<00:00:33.680> as<00:00:33.840> well\\n\\n00:00:34.069 --> 00:00:34.079 align:start position:0%\\nyour thighs or your quadriceps as well\\n \\n\\n00:00:34.079 --> 00:00:36.150 align:start position:0%\\nyour thighs or your quadriceps as well\\nas<00:00:34.239> your<00:00:34.480> butt<00:00:34.719> or<00:00:34.800> your<00:00:34.960> glutes\\n\\n00:00:36.150 --> 00:00:36.160 align:start position:0%\\nas your butt or your glutes\\n \\n\\n00:00:36.160 --> 00:00:37.750 align:start position:0%\\nas your butt or your glutes\\nand<00:00:36.399> it's<00:00:36.559> also<00:00:36.800> going<00:00:36.800> to<00:00:36.880> be<00:00:37.040> great<00:00:37.280> to<00:00:37.440> work\\n\\n00:00:37.750 --> 00:00:37.760 align:start position:0%\\nand it's also going to be great to work\\n \\n\\n00:00:37.760 --> 00:00:41.510 align:start position:0%\\nand it's also going to be great to work\\non<00:00:38.079> your<00:00:38.320> posture\\n\\n00:00:41.510 --> 00:00:41.520 align:start position:0%\\n \\n \\n\\n00:00:41.520 --> 00:00:45.840 align:start position:0%\\n \\nthat's<00:00:41.760> how<00:00:41.920> to<00:00:42.000> properly<00:00:42.399> perform<00:00:42.879> a<00:00:42.960> squat\\n\\n\", transcript=\"00:00:00\\nlet's learn how to properly perform a\\n00:00:01\\nsquat\\n00:00:02\\nstart with your feet slightly wider than\\n00:00:04\\nshoulder width apart\\n00:00:06\\ncross your arms in front\\n00:00:09\\nso touch your right hand to your left\\n00:00:11\\nshoulder and vice versa pointing your\\n00:00:13\\nelbows straight ahead now from here\\n00:00:15\\nshift your weight to the ball of your\\n00:00:17\\nfeet\\n00:00:18\\nand bend your knees\\n00:00:21\\nget as close to 90 degrees as you can\\n00:00:23\\nlooking straight ahead and from here\\n00:00:25\\npush back up to the starting position\\n00:00:29\\nthis is going to be great for\\n00:00:30\\nstrengthening\\n00:00:31\\nyour thighs or your quadriceps as well\\n00:00:34\\nas your butt or your glutes\\n00:00:36\\nand it's also going to be great to work\\n00:00:37\\non your posture\\n00:00:41\\nthat's how to properly perform a squat\"),\n", + " VideoInfo(video_id='IB_icWRzi4E', url='https://www.youtube.com/watch?v=IB_icWRzi4E', relative_video_path='videos/IB_icWRzi4E.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:02.419 --> 00:00:05.150 align:start position:0%\\n \\nhi<00:00:03.419> I'm<00:00:03.810> Roger<00:00:03.990> Frampton<00:00:04.319> and<00:00:04.770> I'm<00:00:04.859> a<00:00:04.950> movement\\n\\n00:00:05.150 --> 00:00:05.160 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\n \\n\\n00:00:05.160 --> 00:00:07.400 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\ncoach<00:00:05.520> from<00:00:05.910> London<00:00:06.299> and<00:00:06.509> I'm<00:00:06.779> author<00:00:07.049> of<00:00:07.230> the\\n\\n00:00:07.400 --> 00:00:07.410 align:start position:0%\\ncoach from London and I'm author of the\\n \\n\\n00:00:07.410 --> 00:00:10.310 align:start position:0%\\ncoach from London and I'm author of the\\nbook<00:00:07.649> the<00:00:08.010> flexible<00:00:08.400> body<00:00:08.660> so<00:00:09.660> this<00:00:09.840> position\\n\\n00:00:10.310 --> 00:00:10.320 align:start position:0%\\nbook the flexible body so this position\\n \\n\\n00:00:10.320 --> 00:00:11.780 align:start position:0%\\nbook 
the flexible body so this position\\nis<00:00:10.860> the<00:00:11.099> squat\\n\\n00:00:11.780 --> 00:00:11.790 align:start position:0%\\nis the squat\\n \\n\\n00:00:11.790 --> 00:00:13.459 align:start position:0%\\nis the squat\\nmost<00:00:12.150> people<00:00:12.480> when<00:00:12.599> I<00:00:12.660> talk<00:00:12.870> about<00:00:12.990> the<00:00:13.200> squat\\n\\n00:00:13.459 --> 00:00:13.469 align:start position:0%\\nmost people when I talk about the squat\\n \\n\\n00:00:13.469 --> 00:00:14.780 align:start position:0%\\nmost people when I talk about the squat\\nthinking<00:00:13.799> when<00:00:13.920> I'm<00:00:14.009> forming<00:00:14.340> up<00:00:14.460> and<00:00:14.639> down\\n\\n00:00:14.780 --> 00:00:14.790 align:start position:0%\\nthinking when I'm forming up and down\\n \\n\\n00:00:14.790 --> 00:00:16.790 align:start position:0%\\nthinking when I'm forming up and down\\nexercise<00:00:15.299> it<00:00:15.509> works<00:00:15.750> your<00:00:15.960> legs<00:00:16.080> and<00:00:16.350> bar\\n\\n00:00:16.790 --> 00:00:16.800 align:start position:0%\\nexercise it works your legs and bar\\n \\n\\n00:00:16.800 --> 00:00:19.670 align:start position:0%\\nexercise it works your legs and bar\\nwell<00:00:17.640> the<00:00:17.789> squat<00:00:18.090> is<00:00:18.210> actually<00:00:18.570> a<00:00:18.680> position\\n\\n00:00:19.670 --> 00:00:19.680 align:start position:0%\\nwell the squat is actually a position\\n \\n\\n00:00:19.680 --> 00:00:22.609 align:start position:0%\\nwell the squat is actually a position\\nthat<00:00:20.520> we're<00:00:20.730> designed<00:00:21.029> to<00:00:21.180> defecate<00:00:21.660> in<00:00:21.689> every\\n\\n00:00:22.609 --> 00:00:22.619 align:start position:0%\\nthat we're designed to defecate in every\\n \\n\\n00:00:22.619 --> 00:00:25.810 align:start position:0%\\nthat we're designed to defecate in every\\nkid<00:00:22.830> sits<00:00:23.670> and<00:00:24.000> rests<00:00:24.600> in<00:00:24.900> this<00:00:25.140> position<00:00:25.560> and\\n\\n00:00:25.810 --> 00:00:25.820 align:start position:0%\\nkid sits and rests in this position and\\n \\n\\n00:00:25.820 --> 00:00:28.700 align:start position:0%\\nkid sits and rests in this position and\\nif<00:00:26.820> we<00:00:26.910> look<00:00:27.119> at<00:00:27.240> Western<00:00:27.480> people<00:00:28.050> I<00:00:28.230> think\\n\\n00:00:28.700 --> 00:00:28.710 align:start position:0%\\nif we look at Western people I think\\n \\n\\n00:00:28.710 --> 00:00:31.550 align:start position:0%\\nif we look at Western people I think\\nmost<00:00:28.890> people<00:00:29.400> end<00:00:30.029> up<00:00:30.240> sitting<00:00:30.510> up<00:00:31.199> on<00:00:31.410> their\\n\\n00:00:31.550 --> 00:00:31.560 align:start position:0%\\nmost people end up sitting up on their\\n \\n\\n00:00:31.560 --> 00:00:33.740 align:start position:0%\\nmost people end up sitting up on their\\ntoes<00:00:31.740> more<00:00:32.070> in<00:00:32.219> this<00:00:32.309> position<00:00:32.759> while<00:00:33.660> I'm\\n\\n00:00:33.740 --> 00:00:33.750 align:start position:0%\\ntoes more in this position while I'm\\n \\n\\n00:00:33.750 --> 00:00:35.510 align:start position:0%\\ntoes more in this position while I'm\\nadvising<00:00:34.290> is<00:00:34.410> that<00:00:34.530> you<00:00:34.649> get<00:00:34.890> this<00:00:35.070> position\\n\\n00:00:35.510 --> 00:00:35.520 align:start position:0%\\nadvising is that you get this position\\n \\n\\n00:00:35.520 --> 00:00:38.150 align:start position:0%\\nadvising is that you get this 
position\\nback<00:00:35.730> not<00:00:36.510> for<00:00:36.719> an<00:00:36.809> exercise<00:00:37.260> necessarily<00:00:38.010> not\\n\\n00:00:38.150 --> 00:00:38.160 align:start position:0%\\nback not for an exercise necessarily not\\n \\n\\n00:00:38.160 --> 00:00:40.610 align:start position:0%\\nback not for an exercise necessarily not\\nbecause<00:00:38.489> of<00:00:38.520> fitness<00:00:38.969> or<00:00:39.149> to<00:00:39.270> be<00:00:39.420> fitter<00:00:39.719> but\\n\\n00:00:40.610 --> 00:00:40.620 align:start position:0%\\nbecause of fitness or to be fitter but\\n \\n\\n00:00:40.620 --> 00:00:43.430 align:start position:0%\\nbecause of fitness or to be fitter but\\njust<00:00:40.920> because<00:00:41.520> you're<00:00:41.730> designed<00:00:42.120> to<00:00:43.110> do<00:00:43.260> it\\n\\n00:00:43.430 --> 00:00:43.440 align:start position:0%\\njust because you're designed to do it\\n \\n\\n00:00:43.440 --> 00:00:45.350 align:start position:0%\\njust because you're designed to do it\\nthis<00:00:43.710> here<00:00:43.980> is<00:00:44.219> just<00:00:44.399> the<00:00:44.610> human<00:00:44.789> resting\\n\\n00:00:45.350 --> 00:00:45.360 align:start position:0%\\nthis here is just the human resting\\n \\n\\n00:00:45.360 --> 00:00:48.229 align:start position:0%\\nthis here is just the human resting\\nposition<00:00:45.780> so<00:00:46.230> when<00:00:46.379> I<00:00:46.410> run<00:00:46.649> classes<00:00:47.250> people\\n\\n00:00:48.229 --> 00:00:48.239 align:start position:0%\\nposition so when I run classes people\\n \\n\\n00:00:48.239 --> 00:00:50.119 align:start position:0%\\nposition so when I run classes people\\nalways<00:00:48.480> talk<00:00:49.140> about<00:00:49.170> the<00:00:49.469> squat<00:00:49.770> how<00:00:49.980> they\\n\\n00:00:50.119 --> 00:00:50.129 align:start position:0%\\nalways talk about the squat how they\\n \\n\\n00:00:50.129 --> 00:00:51.590 align:start position:0%\\nalways talk about the squat how they\\ncan't<00:00:50.399> quite<00:00:50.610> get<00:00:50.789> their<00:00:50.969> heel<00:00:51.149> down<00:00:51.390> on<00:00:51.510> the\\n\\n00:00:51.590 --> 00:00:51.600 align:start position:0%\\ncan't quite get their heel down on the\\n \\n\\n00:00:51.600 --> 00:00:53.569 align:start position:0%\\ncan't quite get their heel down on the\\nfloor<00:00:51.840> now<00:00:52.649> the<00:00:52.739> reason<00:00:53.070> for<00:00:53.250> that<00:00:53.309> is<00:00:53.550> because\\n\\n00:00:53.569 --> 00:00:53.579 align:start position:0%\\nfloor now the reason for that is because\\n \\n\\n00:00:53.579 --> 00:00:55.220 align:start position:0%\\nfloor now the reason for that is because\\nsince<00:00:54.059> about<00:00:54.239> the<00:00:54.300> age<00:00:54.420> of<00:00:54.600> four<00:00:54.840> years<00:00:55.050> old\\n\\n00:00:55.220 --> 00:00:55.230 align:start position:0%\\nsince about the age of four years old\\n \\n\\n00:00:55.230 --> 00:00:58.099 align:start position:0%\\nsince about the age of four years old\\nyou've<00:00:55.800> been<00:00:56.010> wearing<00:00:56.219> shoes<00:00:56.809> we<00:00:57.809> are<00:00:57.960> an\\n\\n00:00:58.099 --> 00:00:58.109 align:start position:0%\\nyou've been wearing shoes we are an\\n \\n\\n00:00:58.109 --> 00:00:58.549 align:start position:0%\\nyou've been wearing shoes we are an\\nanimal\\n\\n00:00:58.549 --> 00:00:58.559 align:start position:0%\\nanimal\\n \\n\\n00:00:58.559 --> 00:01:00.229 align:start position:0%\\nanimal\\nwe're<00:00:59.070> designed<00:00:59.370> to<00:00:59.460> 
walk<00:00:59.640> around<00:00:59.789> on<00:01:00.059> bare\\n\\n00:01:00.229 --> 00:01:00.239 align:start position:0%\\nwe're designed to walk around on bare\\n \\n\\n00:01:00.239 --> 00:01:02.510 align:start position:0%\\nwe're designed to walk around on bare\\nfeet<00:01:00.270> and<00:01:00.690> the<00:01:01.109> reason<00:01:01.469> that<00:01:01.559> we'll<00:01:01.710> lose<00:01:01.949> our\\n\\n00:01:02.510 --> 00:01:02.520 align:start position:0%\\nfeet and the reason that we'll lose our\\n \\n\\n00:01:02.520 --> 00:01:05.210 align:start position:0%\\nfeet and the reason that we'll lose our\\nsquat<00:01:03.090> mobility<00:01:03.570> is<00:01:04.229> simply<00:01:04.619> because<00:01:04.920> we<00:01:05.040> come\\n\\n00:01:05.210 --> 00:01:05.220 align:start position:0%\\nsquat mobility is simply because we come\\n \\n\\n00:01:05.220 --> 00:01:07.370 align:start position:0%\\nsquat mobility is simply because we come\\nto<00:01:05.369> become<00:01:05.610> tight<00:01:05.970> in<00:01:06.150> our<00:01:06.210> ankles<00:01:06.659> and\\n\\n00:01:07.370 --> 00:01:07.380 align:start position:0%\\nto become tight in our ankles and\\n \\n\\n00:01:07.380 --> 00:01:09.590 align:start position:0%\\nto become tight in our ankles and\\ntherefore<00:01:08.070> you<00:01:08.670> can't<00:01:08.970> get<00:01:09.090> all<00:01:09.210> the<00:01:09.270> way<00:01:09.360> down\\n\\n00:01:09.590 --> 00:01:09.600 align:start position:0%\\ntherefore you can't get all the way down\\n \\n\\n00:01:09.600 --> 00:01:11.990 align:start position:0%\\ntherefore you can't get all the way down\\nto<00:01:09.780> the<00:01:09.869> bottom<00:01:10.010> now<00:01:11.010> all<00:01:11.369> you<00:01:11.490> need<00:01:11.640> to<00:01:11.729> do<00:01:11.880> is\\n\\n00:01:11.990 --> 00:01:12.000 align:start position:0%\\nto the bottom now all you need to do is\\n \\n\\n00:01:12.000 --> 00:01:14.030 align:start position:0%\\nto the bottom now all you need to do is\\nif<00:01:12.240> you<00:01:12.390> put<00:01:12.570> your<00:01:12.600> heels<00:01:12.990> on<00:01:13.200> a<00:01:13.229> yoga<00:01:13.590> block<00:01:13.799> or\\n\\n00:01:14.030 --> 00:01:14.040 align:start position:0%\\nif you put your heels on a yoga block or\\n \\n\\n00:01:14.040 --> 00:01:16.580 align:start position:0%\\nif you put your heels on a yoga block or\\na<00:01:14.100> book<00:01:14.400> you're<00:01:15.000> in<00:01:15.119> that<00:01:15.270> position<00:01:15.590> rather\\n\\n00:01:16.580 --> 00:01:16.590 align:start position:0%\\na book you're in that position rather\\n \\n\\n00:01:16.590 --> 00:01:19.070 align:start position:0%\\na book you're in that position rather\\nthan<00:01:16.799> that<00:01:16.890> position<00:01:17.430> that<00:01:18.330> will<00:01:18.600> take<00:01:18.900> away\\n\\n00:01:19.070 --> 00:01:19.080 align:start position:0%\\nthan that position that will take away\\n \\n\\n00:01:19.080 --> 00:01:21.050 align:start position:0%\\nthan that position that will take away\\nyour<00:01:19.320> ankle<00:01:19.680> mobility<00:01:19.770> and<00:01:20.280> allow<00:01:20.549> you<00:01:20.610> to<00:01:20.909> get\\n\\n00:01:21.050 --> 00:01:21.060 align:start position:0%\\nyour ankle mobility and allow you to get\\n \\n\\n00:01:21.060 --> 00:01:23.149 align:start position:0%\\nyour ankle mobility and allow you to get\\nall<00:01:21.210> the<00:01:21.330> way<00:01:21.450> down<00:01:21.689> now<00:01:22.500> you<00:01:22.560> can<00:01:22.770> use<00:01:22.920> a<00:01:22.950> yoga\\n\\n00:01:23.149 --> 
00:01:23.159 align:start position:0%\\nall the way down now you can use a yoga\\n \\n\\n00:01:23.159 --> 00:01:24.950 align:start position:0%\\nall the way down now you can use a yoga\\nblock<00:01:23.490> or<00:01:23.640> a<00:01:23.670> book<00:01:23.880> or<00:01:24.119> two<00:01:24.299> or<00:01:24.450> three<00:01:24.479> of<00:01:24.720> them\\n\\n00:01:24.950 --> 00:01:24.960 align:start position:0%\\nblock or a book or two or three of them\\n \\n\\n00:01:24.960 --> 00:01:27.590 align:start position:0%\\nblock or a book or two or three of them\\nif<00:01:25.830> you're<00:01:26.009> a<00:01:26.070> girl<00:01:26.400> squatting<00:01:26.939> in<00:01:27.060> heels<00:01:27.299> it's\\n\\n00:01:27.590 --> 00:01:27.600 align:start position:0%\\nif you're a girl squatting in heels it's\\n \\n\\n00:01:27.600 --> 00:01:29.600 align:start position:0%\\nif you're a girl squatting in heels it's\\nso<00:01:27.869> much<00:01:28.049> easier<00:01:28.080> and<00:01:28.650> squatting<00:01:29.189> flat<00:01:29.460> on<00:01:29.549> the\\n\\n00:01:29.600 --> 00:01:29.610 align:start position:0%\\nso much easier and squatting flat on the\\n \\n\\n00:01:29.610 --> 00:01:31.490 align:start position:0%\\nso much easier and squatting flat on the\\nfloor<00:01:29.820> so<00:01:30.479> what<00:01:30.600> you<00:01:30.720> do<00:01:30.840> is<00:01:30.960> you<00:01:31.079> start<00:01:31.320> on\\n\\n00:01:31.490 --> 00:01:31.500 align:start position:0%\\nfloor so what you do is you start on\\n \\n\\n00:01:31.500 --> 00:01:33.319 align:start position:0%\\nfloor so what you do is you start on\\nwhatever<00:01:31.710> angle<00:01:32.220> you're<00:01:32.400> comfortable<00:01:32.970> in<00:01:33.090> and\\n\\n00:01:33.319 --> 00:01:33.329 align:start position:0%\\nwhatever angle you're comfortable in and\\n \\n\\n00:01:33.329 --> 00:01:36.260 align:start position:0%\\nwhatever angle you're comfortable in and\\nyou<00:01:34.110> work<00:01:34.350> your<00:01:34.560> way<00:01:34.710> down<00:01:34.740> to<00:01:35.549> becoming<00:01:36.000> flat\\n\\n00:01:36.260 --> 00:01:36.270 align:start position:0%\\nyou work your way down to becoming flat\\n \\n\\n00:01:36.270 --> 00:01:39.679 align:start position:0%\\nyou work your way down to becoming flat\\non<00:01:36.360> the<00:01:36.450> floor<00:01:36.659> so<00:01:37.409> really<00:01:37.740> a<00:01:37.770> squat<00:01:38.490> is<00:01:38.939> just<00:01:39.509> a\\n\\n00:01:39.679 --> 00:01:39.689 align:start position:0%\\non the floor so really a squat is just a\\n \\n\\n00:01:39.689 --> 00:01:42.469 align:start position:0%\\non the floor so really a squat is just a\\ntest<00:01:40.079> of<00:01:40.350> your<00:01:41.070> ankles<00:01:41.549> if<00:01:41.880> you've<00:01:42.090> got<00:01:42.299> good\\n\\n00:01:42.469 --> 00:01:42.479 align:start position:0%\\ntest of your ankles if you've got good\\n \\n\\n00:01:42.479 --> 00:01:45.499 align:start position:0%\\ntest of your ankles if you've got good\\nankles<00:01:42.869> you're<00:01:43.590> great<00:01:43.890> at<00:01:44.070> squatting<00:01:44.549> and<00:01:44.759> if\\n\\n00:01:45.499 --> 00:01:45.509 align:start position:0%\\nankles you're great at squatting and if\\n \\n\\n00:01:45.509 --> 00:01:47.420 align:start position:0%\\nankles you're great at squatting and if\\nyour<00:01:45.659> ankles<00:01:46.020> have<00:01:46.110> become<00:01:46.409> tight<00:01:46.680> then<00:01:47.009> squat\\n\\n00:01:47.420 --> 00:01:47.430 align:start position:0%\\nyour ankles have become tight then 
squat\\n \\n\\n00:01:47.430 --> 00:01:49.609 align:start position:0%\\nyour ankles have become tight then squat\\nis<00:01:47.549> really<00:01:47.820> difficult<00:01:48.060> and<00:01:48.450> hard<00:01:48.689> but<00:01:49.350> like\\n\\n00:01:49.609 --> 00:01:49.619 align:start position:0%\\nis really difficult and hard but like\\n \\n\\n00:01:49.619 --> 00:01:51.499 align:start position:0%\\nis really difficult and hard but like\\nevery<00:01:50.070> other<00:01:50.220> muscle<00:01:50.490> and<00:01:50.880> joint<00:01:51.180> in<00:01:51.390> the<00:01:51.479> body\\n\\n00:01:51.499 --> 00:01:51.509 align:start position:0%\\nevery other muscle and joint in the body\\n \\n\\n00:01:51.509 --> 00:01:54.080 align:start position:0%\\nevery other muscle and joint in the body\\nis<00:01:51.960> trainable<00:01:52.680> and<00:01:52.920> you've<00:01:53.399> got<00:01:53.549> time<00:01:53.820> to<00:01:53.970> get\\n\\n00:01:54.080 --> 00:01:54.090 align:start position:0%\\nis trainable and you've got time to get\\n \\n\\n00:01:54.090 --> 00:01:56.870 align:start position:0%\\nis trainable and you've got time to get\\nit<00:01:54.210> back<00:01:54.710> if<00:01:55.710> you<00:01:55.770> do<00:01:56.070> something<00:01:56.369> for<00:01:56.549> 10\\n\\n00:01:56.870 --> 00:01:56.880 align:start position:0%\\nit back if you do something for 10\\n \\n\\n00:01:56.880 --> 00:01:58.850 align:start position:0%\\nit back if you do something for 10\\nminutes<00:01:57.149> every<00:01:57.360> day<00:01:57.540> kind<00:01:58.259> of<00:01:58.350> like<00:01:58.560> brushing\\n\\n00:01:58.850 --> 00:01:58.860 align:start position:0%\\nminutes every day kind of like brushing\\n \\n\\n00:01:58.860 --> 00:02:01.160 align:start position:0%\\nminutes every day kind of like brushing\\nyour<00:01:59.070> teeth<00:01:59.280> it<00:02:00.000> gets<00:02:00.240> into<00:02:00.509> a<00:02:00.540> habit<00:02:00.930> and<00:02:01.110> you\\n\\n00:02:01.160 --> 00:02:01.170 align:start position:0%\\nyour teeth it gets into a habit and you\\n \\n\\n00:02:01.170 --> 00:02:02.719 align:start position:0%\\nyour teeth it gets into a habit and you\\ngo<00:02:01.320> you<00:02:01.380> get<00:02:01.619> up<00:02:01.740> you<00:02:01.890> brush<00:02:02.070> your<00:02:02.219> teeth<00:02:02.369> go<00:02:02.670> to\\n\\n00:02:02.719 --> 00:02:02.729 align:start position:0%\\ngo you get up you brush your teeth go to\\n \\n\\n00:02:02.729 --> 00:02:05.270 align:start position:0%\\ngo you get up you brush your teeth go to\\nveggie<00:02:03.000> brush<00:02:03.240> your<00:02:03.390> teeth<00:02:03.540> so<00:02:04.290> by<00:02:04.740> doing<00:02:04.799> 10\\n\\n00:02:05.270 --> 00:02:05.280 align:start position:0%\\nveggie brush your teeth so by doing 10\\n \\n\\n00:02:05.280 --> 00:02:06.080 align:start position:0%\\nveggie brush your teeth so by doing 10\\nminutes<00:02:05.520> every<00:02:05.820> day\\n\\n00:02:06.080 --> 00:02:06.090 align:start position:0%\\nminutes every day\\n \\n\\n00:02:06.090 --> 00:02:08.690 align:start position:0%\\nminutes every day\\nyou'll<00:02:06.810> get<00:02:06.990> into<00:02:07.109> the<00:02:07.259> habit<00:02:07.469> by<00:02:08.399> playing\\n\\n00:02:08.690 --> 00:02:08.700 align:start position:0%\\nyou'll get into the habit by playing\\n \\n\\n00:02:08.700 --> 00:02:10.759 align:start position:0%\\nyou'll get into the habit by playing\\nusing<00:02:09.420> just<00:02:09.750> that<00:02:09.899> short<00:02:10.140> of<00:02:10.289> a<00:02:10.379> short<00:02:10.590> 
amount\\n\\n00:02:10.759 --> 00:02:10.769 align:start position:0%\\nusing just that short of a short amount\\n \\n\\n00:02:10.769 --> 00:02:12.559 align:start position:0%\\nusing just that short of a short amount\\nof<00:02:10.920> time<00:02:11.129> you<00:02:11.310> can<00:02:11.340> really<00:02:11.700> focus<00:02:11.910> on<00:02:12.239> how<00:02:12.390> your\\n\\n00:02:12.559 --> 00:02:12.569 align:start position:0%\\nof time you can really focus on how your\\n \\n\\n00:02:12.569 --> 00:02:13.520 align:start position:0%\\nof time you can really focus on how your\\nbody's<00:02:12.870> anemic\\n\\n00:02:13.520 --> 00:02:13.530 align:start position:0%\\nbody's anemic\\n \\n\\n00:02:13.530 --> 00:02:15.050 align:start position:0%\\nbody's anemic\\nnow<00:02:14.010> some<00:02:14.220> of<00:02:14.280> the<00:02:14.340> exercises<00:02:14.790> that<00:02:14.940> I\\n\\n00:02:15.050 --> 00:02:15.060 align:start position:0%\\nnow some of the exercises that I\\n \\n\\n00:02:15.060 --> 00:02:16.699 align:start position:0%\\nnow some of the exercises that I\\nrecommend<00:02:15.540> that<00:02:15.599> you<00:02:15.780> do<00:02:15.989> within<00:02:16.290> those<00:02:16.440> 10\\n\\n00:02:16.699 --> 00:02:16.709 align:start position:0%\\nrecommend that you do within those 10\\n \\n\\n00:02:16.709 --> 00:02:20.270 align:start position:0%\\nrecommend that you do within those 10\\nminutes<00:02:16.860> first<00:02:17.849> one<00:02:18.120> is<00:02:18.330> the<00:02:18.720> squat<00:02:19.280> sitting\\n\\n00:02:20.270 --> 00:02:20.280 align:start position:0%\\nminutes first one is the squat sitting\\n \\n\\n00:02:20.280 --> 00:02:22.759 align:start position:0%\\nminutes first one is the squat sitting\\nin<00:02:20.489> a<00:02:20.580> squat<00:02:20.940> not<00:02:21.480> bobbing<00:02:22.019> up<00:02:22.080> and<00:02:22.230> down<00:02:22.349> just\\n\\n00:02:22.759 --> 00:02:22.769 align:start position:0%\\nin a squat not bobbing up and down just\\n \\n\\n00:02:22.769 --> 00:02:24.589 align:start position:0%\\nin a squat not bobbing up and down just\\nbeing<00:02:23.010> able<00:02:23.190> to<00:02:23.370> function<00:02:24.000> to<00:02:24.209> sit<00:02:24.420> in<00:02:24.540> the\\n\\n00:02:24.589 --> 00:02:24.599 align:start position:0%\\nbeing able to function to sit in the\\n \\n\\n00:02:24.599 --> 00:02:27.229 align:start position:0%\\nbeing able to function to sit in the\\nsquat<00:02:24.930> something<00:02:25.709> again<00:02:26.010> is<00:02:26.250> innate<00:02:26.400> to<00:02:26.819> us<00:02:26.849> as\\n\\n00:02:27.229 --> 00:02:27.239 align:start position:0%\\nsquat something again is innate to us as\\n \\n\\n00:02:27.239 --> 00:02:28.729 align:start position:0%\\nsquat something again is innate to us as\\nhuman<00:02:27.569> beings<00:02:27.599> and<00:02:28.049> something<00:02:28.440> that<00:02:28.530> you<00:02:28.650> are\\n\\n00:02:28.729 --> 00:02:28.739 align:start position:0%\\nhuman beings and something that you are\\n \\n\\n00:02:28.739 --> 00:02:31.670 align:start position:0%\\nhuman beings and something that you are\\nable<00:02:28.890> to<00:02:29.220> do<00:02:29.910> as<00:02:30.090> a<00:02:30.120> child<00:02:30.209> so<00:02:30.870> spend<00:02:31.440> 10\\n\\n00:02:31.670 --> 00:02:31.680 align:start position:0%\\nable to do as a child so spend 10\\n \\n\\n00:02:31.680 --> 00:02:34.640 align:start position:0%\\nable to do as a child so spend 10\\nminutes<00:02:32.239> every<00:02:33.239> day<00:02:33.450> getting<00:02:34.110> your<00:02:34.410> 
squat\\n\\n00:02:34.640 --> 00:02:34.650 align:start position:0%\\nminutes every day getting your squat\\n \\n\\n00:02:34.650 --> 00:02:37.360 align:start position:0%\\nminutes every day getting your squat\\nback\\n\\n00:02:37.360 --> 00:02:37.370 align:start position:0%\\n \\n \\n\\n00:02:37.370 --> 00:02:51.610 align:start position:0%\\n \\n[Music]\\n\\n00:02:51.610 --> 00:02:51.620 align:start position:0%\\n \\n \\n\\n00:02:51.620 --> 00:02:56.310 align:start position:0%\\n \\n[Music]\\n\\n\", transcript=\"00:00:02\\nhi I'm Roger Frampton and I'm a movement\\n00:00:05\\ncoach from London and I'm author of the\\n00:00:07\\nbook the flexible body so this position\\n00:00:10\\nis the squat\\n00:00:11\\nmost people when I talk about the squat\\n00:00:13\\nthinking when I'm forming up and down\\n00:00:14\\nexercise it works your legs and bar\\n00:00:16\\nwell the squat is actually a position\\n00:00:19\\nthat we're designed to defecate in every\\n00:00:22\\nkid sits and rests in this position and\\n00:00:25\\nif we look at Western people I think\\n00:00:28\\nmost people end up sitting up on their\\n00:00:31\\ntoes more in this position while I'm\\n00:00:33\\nadvising is that you get this position\\n00:00:35\\nback not for an exercise necessarily not\\n00:00:38\\nbecause of fitness or to be fitter but\\n00:00:40\\njust because you're designed to do it\\n00:00:43\\nthis here is just the human resting\\n00:00:45\\nposition so when I run classes people\\n00:00:48\\nalways talk about the squat how they\\n00:00:50\\ncan't quite get their heel down on the\\n00:00:51\\nfloor now the reason for that is because\\n00:00:53\\nsince about the age of four years old\\n00:00:55\\nyou've been wearing shoes we are an\\n00:00:58\\nanimal\\nwe're designed to walk around on bare\\n00:01:00\\nfeet and the reason that we'll lose our\\n00:01:02\\nsquat mobility is simply because we come\\n00:01:05\\nto become tight in our ankles and\\n00:01:07\\ntherefore you can't get all the way down\\n00:01:09\\nto the bottom now all you need to do is\\n00:01:11\\n00:01:12\\nif you put your heels on a yoga block or\\n00:01:14\\na book you're in that position rather\\n00:01:16\\nthan that position that will take away\\n00:01:19\\nyour ankle mobility and allow you to get\\n00:01:21\\nall the way down now you can use a yoga\\n00:01:23\\nblock or a book or two or three of them\\n00:01:24\\nif you're a girl squatting in heels it's\\n00:01:27\\nso much easier and squatting flat on the\\n00:01:29\\nfloor so what you do is you start on\\n00:01:31\\nwhatever angle you're comfortable in and\\n00:01:33\\nyou work your way down to becoming flat\\n00:01:36\\non the floor so really a squat is just a\\n00:01:39\\ntest of your ankles if you've got good\\n00:01:42\\nankles you're great at squatting and if\\n00:01:45\\nyour ankles have become tight then squat\\n00:01:47\\nis really difficult and hard but like\\n00:01:49\\nevery other muscle and joint in the body\\n00:01:51\\nis trainable and you've got time to get\\n00:01:54\\nit back if you do something for 10\\n00:01:56\\nminutes every day kind of like brushing\\n00:01:58\\nyour teeth it gets into a habit and you\\n00:02:01\\ngo you get up you brush your teeth go to\\n00:02:02\\nveggie brush your teeth so by doing 10\\n00:02:05\\nminutes every day\\n00:02:06\\nyou'll get into the habit by playing\\n00:02:08\\nusing just that short of a short amount\\n00:02:10\\nof time you can really focus on how your\\n00:02:12\\nbody's anemic\\n00:02:13\\nnow some of the exercises that 
I\\n00:02:15\\nrecommend that you do within those 10\\n00:02:16\\nminutes first one is the squat sitting\\n00:02:20\\nin a squat not bobbing up and down just\\n00:02:22\\nbeing able to function to sit in the\\n00:02:24\\nsquat something again is innate to us as\\n00:02:27\\nhuman beings and something that you are\\n00:02:28\\nable to do as a child so spend 10\\n00:02:31\\nminutes every day getting your squat\\n00:02:34\\nback\\n00:02:37\\n[Music]\\n00:02:51\")],\n", + " 'clip_text_prompts': ['person doing squats'],\n", + " 'segment_infos': [SegmentInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'),\n", + " SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')]}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph.get_state(thread).values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, From b419801cadd9b3d1074fd3e4995969130aeb8db6 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Tue, 27 Aug 2024 12:21:55 +0000 Subject: [PATCH 5/9] Fix agent nodes through annotations generation --- agent.ipynb | 877 ++++++++++++++++++++++++---------------------------- 1 file changed, 402 insertions(+), 475 deletions(-) diff --git a/agent.ipynb b/agent.ipynb index 3e17daa..979ea0d 100644 --- a/agent.ipynb +++ b/agent.ipynb @@ -20,7 +20,8 @@ "import operator\n", "\n", "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "from langchain.pydantic_v1 import BaseModel, Field" + "from langchain.pydantic_v1 import BaseModel, Field\n", + "from langchain_core.prompts import ChatPromptTemplate" ] }, { @@ -52,44 +53,94 @@ " transcript: str\n", "\n", "\n", - "class SegmentInfo(BaseModel): # , Generic[OutputSchema]):\n", + "class SegmentInfo(BaseModel):\n", " start_timestamp: str\n", " end_timestamp: str\n", " fps: float\n", - " # segment_info: Optional[OutputSchema]\n", " video_id: str\n", - " # _frames: Optional[\n", - " # list[np.array]\n", - " # ] # List of raw frames that got into LLM. 
Added for debugging purposes.\n",
-    "\n",
-    " # @classmethod\n",
-    " # def from_frames(cls, start_frame, end_frame, fps, **kwargs):\n",
-    " # return cls(\n",
-    " # start_timestamp=seconds_to_ts(start_frame / fps),\n",
-    " # end_timestamp=seconds_to_ts(end_frame / fps),\n",
-    " # fps=fps,\n",
-    " # **kwargs,\n",
-    " # )\n",
-    "\n",
-    " # @classmethod\n",
-    " # def from_seconds(cls, start_seconds, end_seconds, **kwargs):\n",
-    " # return cls(\n",
-    " # start_timestamp=seconds_to_ts(start_seconds),\n",
-    " # end_timestamp=seconds_to_ts(end_seconds),\n",
-    " # **kwargs,\n",
-    " # )\n",
-    "\n",
-    " # def to_str(self, skip: list[str] = []):\n",
-    " # # skip -> fields from segment_info\n",
-    " # # dict() works both with pydantic model and with with unparsed dict\n",
-    " # if self.segment_info:\n",
-    " # d = dict(self.segment_info)\n",
-    " # for s in skip:\n",
-    " # del d[s]\n",
-    " # d = \": \" + json.dumps(d)\n",
-    " # else:\n",
-    " # d = \"\"\n",
-    " # return f\"{self.start_timestamp}-{self.end_timestamp}{d}\""
+    "\n",
+    "\n",
+    "class LocalClue(BaseModel):\n",
+    " \"\"\"Local clues for a segment\"\"\"\n",
+    "\n",
+    " id: str = Field(description=\"LC1,LC2...\")\n",
+    " quote: str = Field(\n",
+    " description=\"the quote from the transcript that was used to create this clue.\"\n",
+    " )\n",
+    " quote_timestamp_start: str = Field(\n",
+    " description=\"the exact start timestamp of the quote.\"\n",
+    " )\n",
+    " quote_timestamp_end: str = Field(\n",
+    " description=\"the exact end timestamp of the quote.\"\n",
+    " )\n",
+    " clue: str = Field(description=\"the main clue data\")\n",
+    "\n",
+    "\n",
+    "class GlobalClue(BaseModel):\n",
+    " \"\"\"Global clues for a segment\"\"\"\n",
+    "\n",
+    " id: str = Field(description=\"GC1,GC2...\")\n",
+    " quote: str = Field(\n",
+    " description=\"the quote from the transcript that was used to create this clue.\"\n",
+    " )\n",
+    " quote_timestamp_start: str = Field(\n",
+    " description=\"the exact start timestamp of the quote.\"\n",
+    " )\n",
+    " quote_timestamp_end: str = Field(\n",
+    " description=\"the exact end timestamp of the quote.\"\n",
+    " )\n",
+    " clue: str = Field(description=\"the main clue data.\")\n",
+    " relevance_to_segment: str = Field(\n",
+    " description=\"why do you think this global clue is relevant to the segment you are working with right now.\"\n",
+    " )\n",
+    "\n",
+    "\n",
+    "class LogicalInference(BaseModel):\n",
+    " \"\"\"Logical inferences for a segment\"\"\"\n",
+    "\n",
+    " id: str = Field(description=\"LI1,LI2,...\")\n",
+    " description: str = Field(description=\"A concise form of the logical inference.\")\n",
+    " details: str = Field(\n",
+    " description=\"A verbose explanation of the insight about what happens in this segment that should be drawn from the clues that you found.\"\n",
+    " )\n",
+    "\n",
+    "\n",
+    "class SegmentAnnotation(BaseModel):\n",
+    " local_clues: list[LocalClue] = Field(\n",
+    " description=\"Local clues are inside the segment in terms of timestamps.\"\n",
+    " )\n",
+    " global_clues: list[GlobalClue] = Field(\n",
+    " description=\"Global clues are scattered across the entire transcript.\"\n",
+    " )\n",
+    " logical_inferences: list[LogicalInference] = Field(\n",
+    " description=\"What can be inferred, from the clues inside this segment, about the topic the user is looking for in the video\"\n",
+    " )\n",
+    "\n",
+    "\n",
+    "class SegmentWithClueInfo(BaseModel):\n",
+    " \"\"\"\n",
+    " Annotation for a video segment.\n",
+    " \"\"\"\n",
+    "\n",
+    " start_timestamp: str = Field(\n",
+    " description=\"start timestamp of the segment in format HH:MM:SS.MS\"\n",
+    " )\n",
+    " end_timestamp: str = Field(\n",
+    " description=\"end timestamp of the segment in format HH:MM:SS.MS\"\n",
+    " )\n",
+    " segment_annotation: SegmentAnnotation = Field(\n",
+    " description=\"list of annotations for the segment\"\n",
+    " )\n",
+    "\n",
+    "\n",
+    "class VideoAnnotation(BaseModel):\n",
+    " \"\"\"\n",
+    " Segments of a video.\n",
+    " \"\"\"\n",
+    "\n",
+    " segments: list[SegmentWithClueInfo] = Field(\n",
+    " description=\"information about each segment\"\n",
+    " )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
@@ -100,15 +151,16 @@
   "source": [
     "# 2. Create the state\n",
     "\n",
+    "\n",
     "class AgentState(TypedDict):\n",
-    "\ttask: str\n",
-    "\tsearch_queries: List[str]\n",
-    "\tvideo_ids: List[str]\n",
-    "\tvideo_infos: List[VideoInfo]\n",
-    "\tclip_text_prompts: List[str]\n",
-    "\tsegment_infos: List[SegmentInfo]\n",
-    "\tclues = List[str]\n",
-    "\tannotations = List[str]"
+    " task: str\n",
+    " search_queries: List[str]\n",
+    " video_ids: List[str]\n",
+    " video_infos: List[VideoInfo]\n",
+    " clip_text_prompts: List[str]\n",
+    " segment_infos: List[SegmentInfo]\n",
+    " clues: List[str]\n",
+    " annotations: List[str]"
   ]
  },
@@ -174,50 +226,166 @@
     "3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. Logical inferences for a segment are deducted from local and global clues for this segment.\n",
     "\n",
     "!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!\n",
-    "\"\"\"\n",
-    "\n",
-    "# also MANY a structured output prompt\n",
-    "\n",
-    "# EXTRACT_CLUES_PROMPT = \"\"\"\n",
-    "# \"User's instructions: The provided video is a tutorial about how to perform squats.\n",
-    "\n",
-    "# I need to understand HOW THE PERSON SHOWN IN EACH SEGMENT PERFORMS SQUATS IN THIS SEGMENT.\n",
-    "# What is done correctly.\n",
-    "# What mistakes they make. 
Why these mistakes happen.\n", - "# How these mistakes could be improved.\n", - "\n", - "# It is very improtant that the information that you provide would describe how the person shown in the segment is doing squats, and not some generic advice that is unrelated to the visual information.\n", - "# \"\"\"\n", - "\n", - "# prompt.append('Segment timecodes and optional additional information:\\n' + '\\n'.join([s.to_str(skip=[filter_by] if filter_by else []) for s in video_segments_part]))\n", - "# prompt.append('Transcript:\\n' + transcript)\n", "\n", + " Good local clues examples: [\n", + " {\n", + " \"id\": \"LC1\",\n", + " \"timestamp\": \"00:00:19\",\n", + " \"quote\": \"exercises do them wrong and instead of\",\n", + " \"clue\": \"This phrase introduces the concept of incorrect exercise form, setting the stage for a demonstration of improper technique.\"\n", + " },\n", + " {\n", + " \"id\": \"LC2\",\n", + " \"timestamp\": \"00:00:21\",\n", + " \"quote\": \"growing nice quads and glutes you'll\",\n", + " \"clue\": \"Mentions the expected benefits of proper squats (muscle growth), implying that these benefits won't be achieved with incorrect form.\"\n", + " },\n", + " {\n", + " \"id\": \"LC3\",\n", + " \"timestamp\": \"00:00:22\",\n", + " \"quote\": \"feel aches and pains in your knees your\",\n", + " \"clue\": \"Directly states negative consequences of improper form, strongly suggesting that this segment demonstrates incorrect technique.\"\n", + " },\n", + " {\n", + " \"id\": \"LC4\",\n", + " \"timestamp\": \"00:00:24\",\n", + " \"quote\": \"lower back and even your shoulders\",\n", + " \"clue\": \"Continuation of LC3, emphasizing multiple areas of potential pain from improper form.\"\n", + " },\n", + " {\n", + " \"id\": \"LC5\",\n", + " \"timestamp\": \"00:00:26\",\n", + " \"quote\": \"let's see how to do it correctly\",\n", + " \"clue\": \"This phrase suggests a transition is about to occur. The incorrect form has been shown, and correct form will follow.\"\n", + " }\n", + " ]\n", "\n", - "GEN_ANNOTATIONS_PROMPT = \"\"\"You are a helpful assistant that performs high quality data investigation and transformation.\n", - " You will be given a JSON object with clues and other helpful information about what's going on \n", - " in a specific part of a video file. This part is called a segment. Your job is to:\n", - " 1. Read this JSON object carefully\n", - " 2. Answer user's questions about this segment\n", - " 3. Provide the answer as a JSON object in a schema provided by the user\n", - " Important rules:\n", - " 1. You can only rely on data presented in a provided JSON object. Don't improvise.\n", - " 2. Follow user's request carefully.\n", - " 3. Don't rush to deliver the answer. Take some time to think. Make a deep breath. Then start writing.\n", - " 4. If you want to output field as empty (null), output it as JSON null (without quotes), not as a string \"null\". 
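The rules above ask the model to keep local-clue quotes strictly inside the segment's timecodes. A small helper along these lines could spot-check that mechanically after parsing; `ts_to_seconds` and `quote_inside_segment` are hypothetical names sketched here, not part of this patch.

    # Hypothetical validation helper, assuming the LocalClue/SegmentInfo models above.
    def ts_to_seconds(ts: str) -> float:
        """Parse 'HH:MM:SS' or 'HH:MM:SS.mmm' into seconds."""
        h, m, s = ts.split(":")
        return int(h) * 3600 + int(m) * 60 + float(s)

    def quote_inside_segment(clue: LocalClue, segment: SegmentInfo) -> bool:
        """True if the quote's timestamps fall inside the segment (prompt rule 4)."""
        return (
            ts_to_seconds(segment.start_timestamp)
            <= ts_to_seconds(clue.quote_timestamp_start)
            <= ts_to_seconds(clue.quote_timestamp_end)
            <= ts_to_seconds(segment.end_timestamp)
        )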
\n", + " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", + " For example, if the transcript says:\n", + " \"00:05:02\n", + " he took the glasses\n", + " 00:05:04\n", + " and gave them to me\"\n", + " Then a GOOD output will be:\n", + " - timestamp: 00:05:03\n", + " - quote: \"he took the glasses and gave them to me\"\n", + " And a BAD output would be:\n", + " - timestamp: 00:04:02\n", + " - quote: \"he gave me the glasses\"\n", + "\n", + " Good global clues examples: [\n", + " {\n", + " \"id\": \"GC1\",\n", + " \"timestamp\": \"00:01:15\",\n", + " \"quote\": \"Before we dive into specific techniques, let's talk about safety.\",\n", + " \"clue\": \"Introduces the theme of safety in squatting.\",\n", + " \"relevance_to_segment\": \"This earlier emphasis on safety provides context for why proper depth is important and why it's being addressed in our segment. It connects to the fear of knee pain mentioned in LC3.\"\n", + " },\n", + " {\n", + " \"id\": \"GC2\",\n", + " \"timestamp\": \"00:02:30\",\n", + " \"quote\": \"Squatting is a fundamental movement pattern in everyday life.\",\n", + " \"clue\": \"Emphasizes the importance of squats beyond just exercise.\",\n", + " \"relevance_to_segment\": \"This broader context heightens the importance of learning proper squat depth as demonstrated in our segment. It suggests that the techniques shown have applications beyond just gym workouts.\"\n", + " },\n", + " {\n", + " \"clue_id\": \"GC3\",\n", + " \"timestamp\": \"00:05:20\",\n", + " \"quote\": \"If you have existing knee issues, consult a physician before attempting deep squats.\",\n", + " \"clue\": \"Provides a health disclaimer related to squat depth.\",\n", + " \"relevance_to_segment\": \"While this comes after our segment, it's relevant because it addresses the concern about knee pain mentioned in LC3. It suggests that the demonstration in our segment is generally safe but acknowledges individual variations.\"\n", + " },\n", + " {\n", + " \"clue_id\": \"GC4\",\n", + " \"timestamp\": \"00:06:45\",\n", + " \"quote\": \"Proper depth ensures full engagement of your quadriceps and glutes.\",\n", + " \"clue\": \"Explains the benefit of correct squat depth.\",\n", + " \"relevance_to_segment\": \"This later explanation provides justification for the depth guideline given in LC4. It helps viewers understand why the demonstrated technique is important.\"\n", + " },\n", + " {\n", + " \"clue_id\": \"GC5\",\n", + " \"timestamp\": \"00:00:30\",\n", + " \"quote\": \"Today, we'll cover squat variations for beginners to advanced lifters.\",\n", + " \"clue\": \"Outlines the scope of the entire video.\",\n", + " \"relevance_to_segment\": \"This early statement suggests that our segment, focusing on proper depth, is part of a comprehensive guide. 
It implies that the demonstration might be adaptable for different skill levels.\"\n", + " }\n", + " ]\n", + " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", + " For example, if the transcript says:\n", + " \"00:05:02\n", + " he took the glasses\n", + " 00:05:04\n", + " and gave them to me\"\n", + " Then a GOOD output will be:\n", + " - timestamp: 00:05:03\n", + " - quote: \"he took the glasses and gave them to me\"\n", + " And a BAD output would be:\n", + " - timestamp: 00:04:02\n", + " - quote: \"he gave me the glasses\"\n", + " \n", + "\n", + " Good logical inference examples:\n", + " [\n", + " {\n", + " \"id\": \"LI1\",\n", + " \"description\": \"Primary Demonstration of Heel Lift\",\n", + " \"details\": \"Given that GC1-GC3 describe the 'most common mistake' as heels lifting off the ground, and this description immediately precedes our segment, it's highly probable that this is the primary error being demonstrated. This is further supported by the segment's focus on incorrect form (LC1-LC4).\"\n", + " },\n", + " {\n", + " \"id\": \"LI2\",\n", + " \"description\": \"Multiple Error Demonstration\",\n", + " \"details\": \"While heel lift is likely the primary focus, the mention of multiple pain points (knees, lower back, shoulders in LC3-LC4) suggests that the demonstrator may be exhibiting several forms of incorrect technique simultaneously. This comprehensive 'what not to do' approach would be pedagogically effective.\"\n", + " },\n", + " {\n", + " \"id\": \"LI3\",\n", + " \"description\": \"Possible Inclusion of 'Butt Wink'\",\n", + " \"details\": \"Although 'butt wink' is mentioned after our segment (GC4-GC6), its connection to back pain (which is mentioned in LC4) raises the possibility that this error is also present in the demonstration. The instructor may be showing multiple errors early on, then breaking them down individually later.\"\n", + " },\n", + " {\n", + " \"id\": \"LI4\",\n", + " \"description\": \"Segment Placement in Overall Video Structure\",\n", + " \"details\": \"The segment's position (starting at 00:00:19) and the phrase 'let's see how to do it correctly' (LC5) at the end suggest this is an early, foundational part of the video. It likely serves to grab attention by showing common mistakes before transitioning to proper form instruction.\"\n", + " },\n", + " {\n", + " \"id\": \"LI5\",\n", + " \"description\": \"Intentional Exaggeration of Errors\",\n", + " \"details\": \"Given the educational nature of the video, it's plausible that the demonstrator is intentionally exaggerating the incorrect form. This would make the errors more obvious to viewers and enhance the contrast with correct form shown later.\"\n", + " }\n", + " ]\n", "\"\"\"\n", "\n", "\n", - "# human_prompt = \"\"\"\n", - "# You are given a JSON object that contains clues about segments of a video with timecodes.\n", - "# !!!! For each segment provided in a JSON object you need to answer on the following questions:\n", - "# 1. Given the data found in the JSON object, what is a probability that this part contains a footage of a person doing squats? [the answer could be only \"high\", \"medium\", \"low\", or null (if impossible to infer from the provided data)]\n", - "# 2. Given the data found in the JSON object and even if the answer on the previous question is \"low\", does this person do squats right, wrong, or mixed? 
[the answer could be only \"right\", \"wrong\", \"mixed\", or null (if impossible to infer from the provided data)]\n", - "# 3. Given the data found in the JSON object, what exactly does thing person do right and/or wrong regarding their squats technique? [the answer should be clear and focused on body parts]\n", - "# 4. If the answer on the previous question contains description of wrong technique, explain how to fix these mistakes using your \"own knowledge\" like you are a sports coach.\n", - "# \"\"\"\n", - "\n", - "# for clue in clues_part:\n", - "# prompt.append(\"Segment:\\n\" + json.dumps(clue))" + "GEN_ANNOTATIONS_PROMPT = \"\"\"You are a helpful assistant that performs high quality data investigation and transformation.\n", + " You will be given a JSON object with clues and other helpful information about what's going on \n", + " in a specific part of a video file. This part is called a segment. Your job is to:\n", + " 1. Read this JSON object carefully\n", + " 2. Answer user's questions about this segment\n", + " 3. Provide the answer as a JSON object in a schema provided by the user\n", + " Important rules:\n", + " 1. You can only rely on data presented in a provided JSON object. Don't improvise.\n", + " 2. Follow user's request carefully.\n", + " 3. Don't rush to deliver the answer. Take some time to think. Make a deep breath. Then start writing.\n", + " 4. If you want to output field as empty (null), output it as JSON null (without quotes), not as a string \"null\". \n", + "—> GOOD EXAMPLES:\n", + " \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", + " \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", + " \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", + " \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", + " \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", + " \"correction\":\" Keep your weight on your heels and press through them as you rise.\"\n", + " \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", + " \"correction\":null\n", + "—> BAD EXAMPLES:\n", + " \"wrong\":\"knees\"\n", + " \"correction\":\"fix knees\"\n", + " \"wrong\":\"back looks funny\"\n", + " \"correction\":\"make back better\"\n", + " \"wrong\":\"feet are doing something\"\n", + " \"correction\":\"feet should be different\"\n", + " \"right\":\"arms\"\n", + " \"correction\":\"arms are fine i think\"\n", + "—> BAD EXAMPLES END HERE\n", + "\"\"\"" ] }, { @@ -225,29 +393,6 @@ "execution_count": 7, "metadata": {}, "outputs": [], - "source": [ - "# from datagen import DatagenConfig, get_video_ids, download_videos, detect_segments_clip, generate_clues, generate_annotations\n", - "\n", - "# config_params = {\n", - "# \"openai\": {\n", - "# \"type\": \"azure\", # openai/azure\n", - "# \"temperature\": \"1\",\n", - "# \"deployment\": \"gpt4o\", # model for openai / deployment for azure\n", - "# },\n", - "# \"data_dir\": \"./tmp/squats\",\n", - "# }\n", - "\n", - "# !mkdir -p {config_params[\"data_dir\"]}\n", - "\n", - "# # this config handles all the bookeeping so you need to pass it everywhere.\n", - "# config = DatagenConfig(**config_params)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], "source": [ "import scrapetube\n", "import yt_dlp\n", @@ -264,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 9, + 
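The null rule in the prompt above is easy to get wrong, so here is the distinction it is driving at, shown with plain `json` (illustrative, not part of the committed cells):

    import json

    # Empty fields must become JSON null, not the string "null".
    good = {"right": "Chest is up, shoulders back.", "wrong": None, "correction": None}
    bad = {"right": "Chest is up, shoulders back.", "wrong": "null", "correction": "null"}

    print(json.dumps(good))  # ... "wrong": null ...   <- what the prompt asks for
    print(json.dumps(bad))   # ... "wrong": "null" ... <- the failure mode it forbids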
"execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -293,13 +438,8 @@ " fps = vr.get_avg_fps()\n", " frame_indices = range(0, num_frames, round(fps))\n", "\n", - " # print(f\"Num frames: {num_frames}, fps: {fps}\")\n", - " # print(f\"Len frame indices: {len(frame_indices)}\")\n", - "\n", - " # frames = vr.get_batch(frame_indices)\n", - "\n", " for frame_idx in frame_indices:\n", - " print(f\"Frame idx {frame_idx}\")\n", + " # print(f\"Frame idx {frame_idx}\")\n", " frame = vr[frame_idx].asnumpy()\n", " yield {\n", " \"frame\": frame,\n", @@ -307,9 +447,6 @@ " \"video_id\": video_idx,\n", " }\n", "\n", - " # print(\"video done\")\n", - " # print(\"all videos done\")\n", - "\n", " def __next__(self):\n", " return next(self.frame_generator)\n", "\n", @@ -319,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -465,9 +602,6 @@ " return {\"video_infos\": video_infos}\n", "\n", "\n", - "DATAFRAME = None\n", - "\n", - "\n", "def detect_segments_node(state: AgentState):\n", "\n", " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", @@ -482,31 +616,6 @@ "\n", " dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n", "\n", - " # # Define a `worker_init_fn` that configures each dataset copy differently\n", - " # def worker_init_fn(worker_id):\n", - " # worker_info = torch.utils.data.get_worker_info()\n", - " # worker_dataset = worker_info.dataset # the dataset copy in this worker process\n", - " # video_infos = worker_dataset.video_infos\n", - "\n", - " # chunk_size = math.ceil(len(video_infos) / worker_info.num_workers)\n", - " # video_infos_chunks = [\n", - " # video_infos[i : i + chunk_size]\n", - " # for i in range(0, len(video_infos), chunk_size)\n", - " # ]\n", - "\n", - " # worker_dataset.video_infos = video_infos_chunks[worker_info.id]\n", - "\n", - " # # print(worker_dataset.video_infos)\n", - " # print(f\"Worker {worker_info.id} initialized\")\n", - "\n", - " # configure the dataset to only process the split workload\n", - "\n", - " # per_worker = math.ceil(len(dataset) / float(worker_info.num_workers))\n", - "\n", - " # worker_id = worker_info.id\n", - " # dataset.start = overall_start + worker_id * per_worker\n", - " # dataset.end = min(dataset.start + per_worker, overall_end)\n", - "\n", " dataloader = torch.utils.data.DataLoader(\n", " dataset,\n", " num_workers=1,\n", @@ -534,12 +643,6 @@ " except StopIteration:\n", " break\n", "\n", - " # print(\"Batch fetched\")\n", - " # # print(batch)\n", - "\n", - " # time.sleep(30)\n", - "\n", - " start_time = time.time()\n", " inputs = processor(\n", " images=batch[\"frame\"],\n", " text=clip_text_prompts,\n", @@ -548,24 +651,12 @@ " truncation=True,\n", " )\n", " inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n", - " # print(f\"Proc time: {time.time() - start_time:.2f} seconds\")\n", "\n", - " # print(\"Inputs prepared\")\n", - " # time.sleep(5)\n", - " # print(inputs[\"pixel_values\"].shape)\n", - " # print(inputs[\"input_ids\"].shape)\n", - "\n", - " start_time = time.time()\n", " outputs = model(**inputs)\n", - " # print(f\"Forward time: {time.time() - start_time:.2f} seconds\")\n", "\n", " logits = outputs.logits_per_image\n", " probs = torch.nn.functional.sigmoid(logits).detach().cpu().numpy()\n", "\n", - " # print(\"Forward pass complete\")\n", - " # print(f\"video_id {len(batch[\"video_id\"])}\")\n", - " # print(len(probs))\n", - "\n", " for video_idx, frame_idx, prob in zip(\n", " batch[\"video_id\"], batch[\"frame_idx\"], 
probs\n", " ):\n", @@ -576,10 +667,6 @@ " clip_results_dict[\"frame_idx\"].append(frame_idx.item())\n", " clip_results_dict[\"probs\"].append(prob.item())\n", "\n", - " # print(f\"Len clip results: {len(clip_results_dict['video_id'])}\")\n", - "\n", - " # print(\"Outputs parsed\")\n", - "\n", " print(\"All frames processed\")\n", " clip_results = pd.DataFrame(clip_results_dict)\n", " print(\"Dataframe created\")\n", @@ -608,7 +695,6 @@ " f\"%H:%M:%S.{round((s%1)*1000):03d}\", time.gmtime(s)\n", " )\n", "\n", - " \n", " for start, end in segments_start_end:\n", " segment_infos.append(\n", " SegmentInfo(\n", @@ -622,76 +708,99 @@ " return {\"segment_infos\": segment_infos}\n", "\n", "\n", - "# def extract_clues_node(state: AgentState):\n", - "# clues = []\n", - "\n", - "# clues = generate_clues(\n", - "# # video_ids=['byxWus7BwfQ'],\n", - "# config=config,\n", - "# human_prompt=human_prompt,\n", - "# segments_per_call=5, # the output might be quite long, so need to limit number of segments per gpt call to respect max output legnth\n", - "# raise_on_error=True, # interrupt when encountering an error. Useful for debugging.\n", - "# )\n", - "\n", - "# return {\"clues\": clues}\n", - "\n", - "\n", - "# def gen_annotations_node(state: AgentState):\n", - "\n", - "# class SegmentFeedback(BaseModel):\n", - "# \"\"\"\n", - "# —> GOOD EXAMPLES:\n", - "# \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", - "# \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", - "# \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", - "# \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", - "# \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", - "# \"correction\":\" Keep your weight on your heels and press through them as you rise.\"\n", - "# \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", - "# \"correction\":null\n", - "# —> BAD EXAMPLES:\n", - "# \"wrong\":\"knees\"\n", - "# \"correction\":\"fix knees\"\n", - "# \"wrong\":\"back looks funny\"\n", - "# \"correction\":\"make back better\"\n", - "# \"wrong\":\"feet are doing something\"\n", - "# \"correction\":\"feet should be different\"\n", - "# \"right\":\"arms\"\n", - "# \"correction\":\"arms are fine i think\"\n", - "# —> BAD EXAMPLES END HERE\n", - "# \"\"\"\n", - "\n", - "# right: Optional[str] = Field(description=\"what was right in the performance\")\n", - "# wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n", - "# correction: Optional[str] = Field(\n", - "# description=\"how and in what ways it the performance could be improved\"\n", - "# )\n", - "\n", - "# # The segment timestamps are taken from the provided information.\n", - "# class SegmentAnnotation(BaseModel):\n", - "# squats_probability: Optional[str] = Field(\n", - "# description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n", - "# )\n", - "# squats_technique_correctness: Optional[str] = Field(\n", - "# description=\"correctness of the squat technique.\"\n", - "# )\n", - "# squats_feedback: Optional[SegmentFeedback] = Field(\n", - "# description=\"what was right and wrong in the squat perfomance in the segment. 
When the technique is incorrect, provide instructions how to correct them.\"\n",
-    "# )\n",
-    "\n",
-    "# annotations = generate_annotations(\n",
-    "# human_prompt=human_prompt,\n",
-    "# config=config,\n",
-    "# segments_per_call=5,\n",
-    "# annotation_schema=SegmentAnnotation,\n",
-    "# )\n",
-    "\n",
-    "# return {\"annotations\": annotations}"
+    "def extract_clues_node(state: AgentState):\n",
+    "\n",
+    " prompt_template = ChatPromptTemplate.from_messages(\n",
+    " [\n",
+    " (\"system\", EXTRACT_CLUES_PROMPT),\n",
+    " (\n",
+    " \"user\",\n",
+    " \"Segment timecodes: {{ segment_timecodes }}\\nTranscript: {{ transcript }}\",\n",
+    " ),\n",
+    " ],\n",
+    " template_format=\"jinja2\",\n",
+    " )\n",
+    "\n",
+    " model = prompt_template | llm.with_structured_output(VideoAnnotation)\n",
+    "\n",
+    " segment_infos_dict = defaultdict(list)\n",
+    " for segment_info in state[\"segment_infos\"]:\n",
+    " segment_infos_dict[segment_info.video_id].append(segment_info)\n",
+    "\n",
+    " video_infos_dict = {\n",
+    " video_info.video_id: video_info for video_info in state[\"video_infos\"]\n",
+    " }\n",
+    "\n",
+    " clues = []\n",
+    "\n",
+    " for video_id, segment_infos in segment_infos_dict.items():\n",
+    " transcript = video_infos_dict[video_id].transcript\n",
+    " segment_infos_chunks = [\n",
+    " segment_infos[i : i + 5] for i in range(0, len(segment_infos), 5)\n",
+    " ]\n",
+    "\n",
+    " for chunk in segment_infos_chunks:\n",
+    " video_annotation: VideoAnnotation = model.invoke(\n",
+    " {\n",
+    " \"segment_timecodes\": \"\\n\".join(\n",
+    " [f\"{s.start_timestamp}-{s.end_timestamp}\" for s in chunk]\n",
+    " ),\n",
+    " \"transcript\": transcript,\n",
+    " }\n",
+    " )\n",
+    " clues.extend(video_annotation.segments)\n",
+    "\n",
+    " return {\"clues\": clues}\n",
+    "\n",
+    "\n",
+    "def gen_annotations_node(state: AgentState):\n",
+    " class SegmentFeedback(BaseModel):\n",
+    " right: Optional[str] = Field(description=\"what was right in the performance\")\n",
+    " wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n",
+    " correction: Optional[str] = Field(\n",
+    " description=\"how and in what ways the performance could be improved\"\n",
+    " )\n",
+    "\n",
+    " # The segment timestamps are taken from the provided information.\n",
+    " class SegmentCompleteAnnotation(BaseModel):\n",
+    " squats_probability: Optional[str] = Field(\n",
+    " description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n",
+    " )\n",
+    " squats_technique_correctness: Optional[str] = Field(\n",
+    " description=\"correctness of the squat technique.\"\n",
+    " )\n",
+    " squats_feedback: Optional[SegmentFeedback] = Field(\n",
+    " description=\"what was right and wrong in the squat performance in the segment. 
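A note on `template_format="jinja2"` in `extract_clues_node` above: the system prompts embed literal JSON braces, which the default `{variable}` template syntax would try to interpret as placeholders; Jinja's `{{ variable }}` markers sidestep that collision. A minimal illustration (not part of the committed cells):

    from langchain_core.prompts import ChatPromptTemplate

    tmpl = ChatPromptTemplate.from_messages(
        [("user", "Clues: {{ clues }}")], template_format="jinja2"
    )
    # Braces inside the *value* pass through untouched:
    print(tmpl.format_messages(clues='{"id": "LC1"}')[0].content)
    # -> Clues: {"id": "LC1"}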
When the technique is incorrect, provide instructions how to correct them.\"\n", + " )\n", + "\n", + " prompt_template = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", GEN_ANNOTATIONS_PROMPT),\n", + " (\"user\", \"Clues: {{ clues }}\"),\n", + " ],\n", + " template_format=\"jinja2\",\n", + " )\n", + "\n", + " model = prompt_template | llm.with_structured_output(SegmentCompleteAnnotation)\n", + "\n", + " clues = state[\"clues\"]\n", + "\n", + " annotations = []\n", + " for clue in clues:\n", + " segment_annotation: SegmentCompleteAnnotation = model.invoke(\n", + " {\"clues\": clue.json()}\n", + " )\n", + "\n", + " annotations.append(segment_annotation.json())\n", + "\n", + " print(annotations)\n", + "\n", + " return {\"annotations\": annotations}" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -708,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -718,32 +827,30 @@ "builder.add_node(\"get_video_ids\", get_video_ids_node)\n", "builder.add_node(\"download\", download_node)\n", "builder.add_node(\"detect_segments\", detect_segments_node)\n", - "# builder.add_node(\"extract_clues\", extract_clues_node)\n", - "# builder.add_node(\"gen_annotations\", gen_annotations_node)\n", + "builder.add_node(\"extract_clues\", extract_clues_node)\n", + "builder.add_node(\"gen_annotations\", gen_annotations_node)\n", "\n", "builder.set_entry_point(\"generate_queries\")\n", "\n", "# builder.add_conditional_edges(\n", - "# \"generate\", \n", - "# should_continue, \n", + "# \"generate\",\n", + "# should_continue,\n", "# {END: END, \"reflect\": \"reflect\"}\n", "# )\n", "\n", "builder.add_edge(\"generate_queries\", \"get_video_ids\")\n", "builder.add_edge(\"get_video_ids\", \"download\")\n", "builder.add_edge(\"download\", \"detect_segments\")\n", - "builder.add_edge(\"detect_segments\", END)\n", - "\n", - "# builder.add_edge(\"detect_segments\", \"extract_clues\")\n", - "# builder.add_edge(\"extract_clues\", \"gen_annotations\")\n", - "# builder.add_edge(\"gen_annotations\", END)\n", + "builder.add_edge(\"detect_segments\", \"extract_clues\")\n", + "builder.add_edge(\"extract_clues\", \"gen_annotations\")\n", + "builder.add_edge(\"gen_annotations\", END)\n", "\n", "graph = builder.compile(checkpointer=memory)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -753,233 +860,8 @@ "{'generate_queries': {'search_queries': ['how to do squats', 'squat exercise tutorial']}}\n", "{'get_video_ids': {'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E']}}\n", "Downloaded video ids: ['IB_icWRzi4E', 'xqvCmoLULNY']\n", - "{'download': {'video_infos': [VideoInfo(video_id='xqvCmoLULNY', url='https://www.youtube.com/watch?v=xqvCmoLULNY', relative_video_path='videos/xqvCmoLULNY.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:00.160 --> 00:00:01.829 align:start position:0%\\n \\nlet's<00:00:00.399> learn<00:00:00.560> how<00:00:00.719> to<00:00:00.880> properly<00:00:01.280> perform<00:00:01.760> a\\n\\n00:00:01.829 --> 00:00:01.839 align:start position:0%\\nlet's learn how to properly perform a\\n \\n\\n00:00:01.839 --> 00:00:02.790 align:start position:0%\\nlet's learn how to properly perform a\\nsquat\\n\\n00:00:02.790 --> 00:00:02.800 align:start position:0%\\nsquat\\n \\n\\n00:00:02.800 --> 00:00:04.470 align:start position:0%\\nsquat\\nstart<00:00:03.120> with<00:00:03.199> 
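Because the graph is compiled with `MemorySaver`, every run is checkpointed per thread, which is what makes the `graph.get_state(thread)` inspection earlier in this notebook possible. A small usage sketch (the `thread_id` value is arbitrary; any consistent string works):

    # Sketch: inspect a checkpointed run (assumes `graph` was compiled above).
    thread = {"configurable": {"thread_id": "1"}}

    snapshot = graph.get_state(thread)   # latest checkpoint for this thread
    print(snapshot.values.get("search_queries"))
    print(snapshot.next)                 # nodes still pending, empty when finished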
your<00:00:03.360> feet<00:00:03.679> slightly<00:00:04.080> wider<00:00:04.319> than\\n\\n00:00:04.470 --> 00:00:04.480 align:start position:0%\\nstart with your feet slightly wider than\\n \\n\\n00:00:04.480 --> 00:00:06.389 align:start position:0%\\nstart with your feet slightly wider than\\nshoulder<00:00:04.799> width<00:00:05.120> apart\\n\\n00:00:06.389 --> 00:00:06.399 align:start position:0%\\nshoulder width apart\\n \\n\\n00:00:06.399 --> 00:00:09.190 align:start position:0%\\nshoulder width apart\\ncross<00:00:06.799> your<00:00:07.040> arms<00:00:07.440> in<00:00:07.759> front\\n\\n00:00:09.190 --> 00:00:09.200 align:start position:0%\\ncross your arms in front\\n \\n\\n00:00:09.200 --> 00:00:11.270 align:start position:0%\\ncross your arms in front\\nso<00:00:09.440> touch<00:00:09.679> your<00:00:09.920> right<00:00:10.240> hand<00:00:10.559> to<00:00:10.719> your<00:00:10.960> left\\n\\n00:00:11.270 --> 00:00:11.280 align:start position:0%\\nso touch your right hand to your left\\n \\n\\n00:00:11.280 --> 00:00:13.350 align:start position:0%\\nso touch your right hand to your left\\nshoulder<00:00:12.080> and<00:00:12.320> vice<00:00:12.559> versa<00:00:13.040> pointing<00:00:13.280> your\\n\\n00:00:13.350 --> 00:00:13.360 align:start position:0%\\nshoulder and vice versa pointing your\\n \\n\\n00:00:13.360 --> 00:00:15.190 align:start position:0%\\nshoulder and vice versa pointing your\\nelbows<00:00:13.679> straight<00:00:13.920> ahead<00:00:14.559> now<00:00:14.719> from<00:00:14.960> here\\n\\n00:00:15.190 --> 00:00:15.200 align:start position:0%\\nelbows straight ahead now from here\\n \\n\\n00:00:15.200 --> 00:00:17.109 align:start position:0%\\nelbows straight ahead now from here\\nshift<00:00:15.440> your<00:00:15.679> weight<00:00:16.160> to<00:00:16.320> the<00:00:16.480> ball<00:00:16.720> of<00:00:16.880> your\\n\\n00:00:17.109 --> 00:00:17.119 align:start position:0%\\nshift your weight to the ball of your\\n \\n\\n00:00:17.119 --> 00:00:18.230 align:start position:0%\\nshift your weight to the ball of your\\nfeet\\n\\n00:00:18.230 --> 00:00:18.240 align:start position:0%\\nfeet\\n \\n\\n00:00:18.240 --> 00:00:21.109 align:start position:0%\\nfeet\\nand<00:00:18.400> bend<00:00:18.720> your<00:00:18.880> knees\\n\\n00:00:21.109 --> 00:00:21.119 align:start position:0%\\nand bend your knees\\n \\n\\n00:00:21.119 --> 00:00:23.189 align:start position:0%\\nand bend your knees\\nget<00:00:21.359> as<00:00:21.520> close<00:00:21.680> to<00:00:21.840> 90<00:00:22.160> degrees<00:00:22.480> as<00:00:22.640> you<00:00:22.800> can\\n\\n00:00:23.189 --> 00:00:23.199 align:start position:0%\\nget as close to 90 degrees as you can\\n \\n\\n00:00:23.199 --> 00:00:25.830 align:start position:0%\\nget as close to 90 degrees as you can\\nlooking<00:00:23.519> straight<00:00:23.840> ahead<00:00:24.480> and<00:00:24.720> from<00:00:24.960> here\\n\\n00:00:25.830 --> 00:00:25.840 align:start position:0%\\nlooking straight ahead and from here\\n \\n\\n00:00:25.840 --> 00:00:29.109 align:start position:0%\\nlooking straight ahead and from here\\npush<00:00:26.160> back<00:00:26.400> up<00:00:26.480> to<00:00:26.640> the<00:00:26.720> starting<00:00:27.039> position\\n\\n00:00:29.109 --> 00:00:29.119 align:start position:0%\\npush back up to the starting position\\n \\n\\n00:00:29.119 --> 00:00:30.150 align:start position:0%\\npush back up to the starting position\\nthis<00:00:29.359> is<00:00:29.439> going<00:00:29.599> to<00:00:29.679> be<00:00:29.760> great<00:00:30.000> 
for\\n\\n00:00:30.150 --> 00:00:30.160 align:start position:0%\\nthis is going to be great for\\n \\n\\n00:00:30.160 --> 00:00:31.830 align:start position:0%\\nthis is going to be great for\\nstrengthening\\n\\n00:00:31.830 --> 00:00:31.840 align:start position:0%\\nstrengthening\\n \\n\\n00:00:31.840 --> 00:00:34.069 align:start position:0%\\nstrengthening\\nyour<00:00:32.079> thighs<00:00:32.480> or<00:00:32.559> your<00:00:32.719> quadriceps<00:00:33.680> as<00:00:33.840> well\\n\\n00:00:34.069 --> 00:00:34.079 align:start position:0%\\nyour thighs or your quadriceps as well\\n \\n\\n00:00:34.079 --> 00:00:36.150 align:start position:0%\\nyour thighs or your quadriceps as well\\nas<00:00:34.239> your<00:00:34.480> butt<00:00:34.719> or<00:00:34.800> your<00:00:34.960> glutes\\n\\n00:00:36.150 --> 00:00:36.160 align:start position:0%\\nas your butt or your glutes\\n \\n\\n00:00:36.160 --> 00:00:37.750 align:start position:0%\\nas your butt or your glutes\\nand<00:00:36.399> it's<00:00:36.559> also<00:00:36.800> going<00:00:36.800> to<00:00:36.880> be<00:00:37.040> great<00:00:37.280> to<00:00:37.440> work\\n\\n00:00:37.750 --> 00:00:37.760 align:start position:0%\\nand it's also going to be great to work\\n \\n\\n00:00:37.760 --> 00:00:41.510 align:start position:0%\\nand it's also going to be great to work\\non<00:00:38.079> your<00:00:38.320> posture\\n\\n00:00:41.510 --> 00:00:41.520 align:start position:0%\\n \\n \\n\\n00:00:41.520 --> 00:00:45.840 align:start position:0%\\n \\nthat's<00:00:41.760> how<00:00:41.920> to<00:00:42.000> properly<00:00:42.399> perform<00:00:42.879> a<00:00:42.960> squat\\n\\n\", transcript=\"00:00:00\\nlet's learn how to properly perform a\\n00:00:01\\nsquat\\n00:00:02\\nstart with your feet slightly wider than\\n00:00:04\\nshoulder width apart\\n00:00:06\\ncross your arms in front\\n00:00:09\\nso touch your right hand to your left\\n00:00:11\\nshoulder and vice versa pointing your\\n00:00:13\\nelbows straight ahead now from here\\n00:00:15\\nshift your weight to the ball of your\\n00:00:17\\nfeet\\n00:00:18\\nand bend your knees\\n00:00:21\\nget as close to 90 degrees as you can\\n00:00:23\\nlooking straight ahead and from here\\n00:00:25\\npush back up to the starting position\\n00:00:29\\nthis is going to be great for\\n00:00:30\\nstrengthening\\n00:00:31\\nyour thighs or your quadriceps as well\\n00:00:34\\nas your butt or your glutes\\n00:00:36\\nand it's also going to be great to work\\n00:00:37\\non your posture\\n00:00:41\\nthat's how to properly perform a squat\"), VideoInfo(video_id='IB_icWRzi4E', url='https://www.youtube.com/watch?v=IB_icWRzi4E', relative_video_path='videos/IB_icWRzi4E.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:02.419 --> 00:00:05.150 align:start position:0%\\n \\nhi<00:00:03.419> I'm<00:00:03.810> Roger<00:00:03.990> Frampton<00:00:04.319> and<00:00:04.770> I'm<00:00:04.859> a<00:00:04.950> movement\\n\\n00:00:05.150 --> 00:00:05.160 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\n \\n\\n00:00:05.160 --> 00:00:07.400 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\ncoach<00:00:05.520> from<00:00:05.910> London<00:00:06.299> and<00:00:06.509> I'm<00:00:06.779> author<00:00:07.049> of<00:00:07.230> the\\n\\n00:00:07.400 --> 00:00:07.410 align:start position:0%\\ncoach from London and I'm author of the\\n \\n\\n00:00:07.410 --> 00:00:10.310 align:start position:0%\\ncoach from London and I'm author of the\\nbook<00:00:07.649> the<00:00:08.010> flexible<00:00:08.400> 
body<00:00:08.660> so<00:00:09.660> this<00:00:09.840> position\\n\\n00:00:10.310 --> 00:00:10.320 align:start position:0%\\nbook the flexible body so this position\\n \\n\\n00:00:10.320 --> 00:00:11.780 align:start position:0%\\nbook the flexible body so this position\\nis<00:00:10.860> the<00:00:11.099> squat\\n\\n00:00:11.780 --> 00:00:11.790 align:start position:0%\\nis the squat\\n \\n\\n00:00:11.790 --> 00:00:13.459 align:start position:0%\\nis the squat\\nmost<00:00:12.150> people<00:00:12.480> when<00:00:12.599> I<00:00:12.660> talk<00:00:12.870> about<00:00:12.990> the<00:00:13.200> squat\\n\\n00:00:13.459 --> 00:00:13.469 align:start position:0%\\nmost people when I talk about the squat\\n \\n\\n00:00:13.469 --> 00:00:14.780 align:start position:0%\\nmost people when I talk about the squat\\nthinking<00:00:13.799> when<00:00:13.920> I'm<00:00:14.009> forming<00:00:14.340> up<00:00:14.460> and<00:00:14.639> down\\n\\n00:00:14.780 --> 00:00:14.790 align:start position:0%\\nthinking when I'm forming up and down\\n \\n\\n00:00:14.790 --> 00:00:16.790 align:start position:0%\\nthinking when I'm forming up and down\\nexercise<00:00:15.299> it<00:00:15.509> works<00:00:15.750> your<00:00:15.960> legs<00:00:16.080> and<00:00:16.350> bar\\n\\n00:00:16.790 --> 00:00:16.800 align:start position:0%\\nexercise it works your legs and bar\\n \\n\\n00:00:16.800 --> 00:00:19.670 align:start position:0%\\nexercise it works your legs and bar\\nwell<00:00:17.640> the<00:00:17.789> squat<00:00:18.090> is<00:00:18.210> actually<00:00:18.570> a<00:00:18.680> position\\n\\n00:00:19.670 --> 00:00:19.680 align:start position:0%\\nwell the squat is actually a position\\n \\n\\n00:00:19.680 --> 00:00:22.609 align:start position:0%\\nwell the squat is actually a position\\nthat<00:00:20.520> we're<00:00:20.730> designed<00:00:21.029> to<00:00:21.180> defecate<00:00:21.660> in<00:00:21.689> every\\n\\n00:00:22.609 --> 00:00:22.619 align:start position:0%\\nthat we're designed to defecate in every\\n \\n\\n00:00:22.619 --> 00:00:25.810 align:start position:0%\\nthat we're designed to defecate in every\\nkid<00:00:22.830> sits<00:00:23.670> and<00:00:24.000> rests<00:00:24.600> in<00:00:24.900> this<00:00:25.140> position<00:00:25.560> and\\n\\n00:00:25.810 --> 00:00:25.820 align:start position:0%\\nkid sits and rests in this position and\\n \\n\\n00:00:25.820 --> 00:00:28.700 align:start position:0%\\nkid sits and rests in this position and\\nif<00:00:26.820> we<00:00:26.910> look<00:00:27.119> at<00:00:27.240> Western<00:00:27.480> people<00:00:28.050> I<00:00:28.230> think\\n\\n00:00:28.700 --> 00:00:28.710 align:start position:0%\\nif we look at Western people I think\\n \\n\\n00:00:28.710 --> 00:00:31.550 align:start position:0%\\nif we look at Western people I think\\nmost<00:00:28.890> people<00:00:29.400> end<00:00:30.029> up<00:00:30.240> sitting<00:00:30.510> up<00:00:31.199> on<00:00:31.410> their\\n\\n00:00:31.550 --> 00:00:31.560 align:start position:0%\\nmost people end up sitting up on their\\n \\n\\n00:00:31.560 --> 00:00:33.740 align:start position:0%\\nmost people end up sitting up on their\\ntoes<00:00:31.740> more<00:00:32.070> in<00:00:32.219> this<00:00:32.309> position<00:00:32.759> while<00:00:33.660> I'm\\n\\n00:00:33.740 --> 00:00:33.750 align:start position:0%\\ntoes more in this position while I'm\\n \\n\\n00:00:33.750 --> 00:00:35.510 align:start position:0%\\ntoes more in this position while I'm\\nadvising<00:00:34.290> is<00:00:34.410> that<00:00:34.530> you<00:00:34.649> 
get<00:00:34.890> this<00:00:35.070> position\n[... per-word WebVTT roll-up cues for 00:00:35-00:02:56 trimmed: this caption format repeats each line across three consecutive cues, and the full speech is preserved once, with timestamps, in the transcript field that follows ...]\", transcript=\"00:00:02\nhi I'm Roger Frampton and I'm a movement\n00:00:05\ncoach from London and I'm author of the\n00:00:07\nbook the flexible body so this position\n00:00:10\nis the squat\n00:00:11\nmost people when I talk about the squat\n00:00:13\nthinking when I'm forming up and down\n00:00:14\nexercise it works your legs and bar\n00:00:16\nwell the squat is actually a position\n00:00:19\nthat we're designed to defecate in every\n00:00:22\nkid sits and rests in this position and\n00:00:25\nif we look at Western people I think\n00:00:28\nmost people end up sitting up on their\n00:00:31\ntoes more in this position while I'm\n00:00:33\nadvising is that you get this position\n00:00:35\nback not for an exercise necessarily not\n00:00:38\nbecause of fitness or to be fitter but\n00:00:40\njust because you're designed to do it\n00:00:43\nthis here is just the human resting\n00:00:45\nposition so when I run classes people\n00:00:48\nalways talk about the squat how they\n00:00:50\ncan't quite get their heel down on the\n00:00:51\nfloor now the reason for that is because\n00:00:53\nsince about the age of four years old\n00:00:55\nyou've been wearing shoes we are an\n00:00:58\nanimal\nwe're designed to walk around on bare\n00:01:00\nfeet and the reason that we'll lose our\n00:01:02\nsquat mobility is simply because we come\n00:01:05\nto become tight in our ankles and\n00:01:07\ntherefore you can't get all the way down\n00:01:09\nto the bottom now all you need to do is\n00:01:11\n00:01:12\nif you put your heels on a yoga block or\n00:01:14\na book you're in that position rather\n00:01:16\nthan that position that will take away\n00:01:19\nyour ankle mobility and allow you to get\n00:01:21\nall the way down now you can use a yoga\n00:01:23\nblock or a book or two or three of them\n00:01:24\nif you're a girl squatting in heels it's\n00:01:27\nso much easier and squatting flat on the\n00:01:29\nfloor so what you do is you start on\n00:01:31\nwhatever angle you're comfortable in and\n00:01:33\nyou work your way down to becoming flat\n00:01:36\non the floor so really a squat is just a\n00:01:39\ntest of your ankles if you've got good\n00:01:42\nankles you're great at squatting and if\n00:01:45\nyour ankles have become tight then squat\n00:01:47\nis really difficult and hard but like\n00:01:49\nevery other muscle and joint in the body\n00:01:51\nis trainable and you've got time to get\n00:01:54\nit back if you do something for 10\n00:01:56\nminutes every day kind of like brushing\n00:01:58\nyour teeth it gets into a habit and you\n00:02:01\ngo you get up you brush your teeth go to\n00:02:02\nveggie brush your teeth so by doing 10\n00:02:05\nminutes every day\n00:02:06\nyou'll get into the habit by playing\n00:02:08\nusing just that short of a short amount\n00:02:10\nof time you can really focus on how your\n00:02:12\nbody's anemic\n00:02:13\nnow some of the exercises that I\n00:02:15\nrecommend that you do within those 10\n00:02:16\nminutes first one is the squat sitting\n00:02:20\nin a squat not bobbing up and down just\n00:02:22\nbeing able to function to sit in the\n00:02:24\nsquat something again is innate to us as\n00:02:27\nhuman beings and something that you are\n00:02:28\nable to do as a child so spend 10\n00:02:31\nminutes every day getting your squat\n00:02:34\nback\n00:02:37\n[Music]\n00:02:51\")]}}\n",
- [... "Frame idx" progress lines 0-600 (steps of 24) removed by this commit ...] + "dowload happened\n", "Init model complete\n",
- [... remaining "Frame idx" progress lines removed by this commit: 624-1152 (steps of 24) for the first video, then 0-4375 (steps of 25) for the second ...] "All frames processed\n", "Dataframe created\n", " video_id frame_idx probs\n",
@@ -998,7 +880,10 @@ "[225 rows x 3 columns]\n", "Segments for video IB_icWRzi4E: [(0, 5), (9, 24), (29, 45), (49, 53), (62, 66), (103, 109), (138, 147)]\n", "Segments for video xqvCmoLULNY: [(1, 44)]\n",
- "{'detect_segments': {'segment_infos': [SegmentInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')]}}\n" + "{'detect_segments': {'segment_infos': [SegmentInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:29.000', 
end_timestamp='00:00:45.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')]}}\n", + "extract_clues happened\n", + "['{\"squats_probability\": \"low\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the squat as a natural human resting position, which is a correct understanding.\", \"wrong\": \"The segment mentions a common issue where people can\\'t get their heels down on the floor during a squat.\", \"correction\": \"Focus on ankle mobility exercises and practice squatting with support to gradually improve heel placement.\"}}', '{\"squats_probability\": \"unknown\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of ankle mobility in squatting and suggests that it can be improved with training.\", \"wrong\": \"The segment likely demonstrates difficulty in squatting due to tight ankles.\", \"correction\": \"To improve squat depth and form, consider placing your heels on a yoga block or a book to compensate for limited ankle mobility.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"unknown\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of stability and function in the squat position, highlighting that squatting is a natural human ability.\", \"wrong\": null, \"correction\": \"Practice sitting in a squat position daily for 10 minutes to regain natural squatting ability.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"correct\", \"squats_feedback\": {\"right\": \"The segment provides a detailed, step-by-step guide on how to perform a squat correctly, including specific instructions on foot placement, arm positioning, weight distribution, and movement.\", \"wrong\": null, \"correction\": null}}']\n", + "{'gen_annotations': {'annotations': ['{\"squats_probability\": \"low\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the squat as a natural human resting position, 
which is a correct understanding.\", \"wrong\": \"The segment mentions a common issue where people can\\'t get their heels down on the floor during a squat.\", \"correction\": \"Focus on ankle mobility exercises and practice squatting with support to gradually improve heel placement.\"}}', '{\"squats_probability\": \"unknown\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of ankle mobility in squatting and suggests that it can be improved with training.\", \"wrong\": \"The segment likely demonstrates difficulty in squatting due to tight ankles.\", \"correction\": \"To improve squat depth and form, consider placing your heels on a yoga block or a book to compensate for limited ankle mobility.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"unknown\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of stability and function in the squat position, highlighting that squatting is a natural human ability.\", \"wrong\": null, \"correction\": \"Practice sitting in a squat position daily for 10 minutes to regain natural squatting ability.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"correct\", \"squats_feedback\": {\"right\": \"The segment provides a detailed, step-by-step guide on how to perform a squat correctly, including specific instructions on foot placement, arm positioning, weight distribution, and movement.\", \"wrong\": null, \"correction\": null}}']}}\n" ] } ], @@ -1011,12 +896,17 @@ " },\n", " thread,\n", "):\n", - " print(s)" + " if \"download\" in s:\n", + " print(\"dowload happened\")\n", + " elif \"extract_clues\" in s:\n", + " print(\"extract_clues happened\")\n", + " else:\n", + " print(s)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1035,10 +925,26 @@ " SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'),\n", " SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'),\n", " SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')]}" + " SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')],\n", + " 'clues': [SegmentWithClueInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"hi I'm Roger Frampton and I'm a movement\", quote_timestamp_start='00:00:02', quote_timestamp_end='00:00:05', clue='Introduction of the speaker, Roger Frampton, who is a movement coach.')], global_clues=[GlobalClue(id='GC1', quote=\"hi I'm Roger Frampton and I'm a movement coach from London and I'm author of the book the flexible body\", quote_timestamp_start='00:00:02', quote_timestamp_end='00:00:07', clue='Introduction of Roger Frampton as a movement coach and author.', relevance_to_segment=\"This provides context about the speaker's expertise and 
credibility, which is relevant to understanding the importance of the information presented in the segment.\")], logical_inferences=[LogicalInference(id='LI1', description='Introduction of the Speaker', details='The segment serves as an introduction to Roger Frampton, establishing his credentials and setting the stage for the subsequent discussion on squats.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='so this position is the squat', quote_timestamp_start='00:00:10', quote_timestamp_end='00:00:11', clue='Introduction of the squat position.'), LocalClue(id='LC2', quote=\"most people when I talk about the squat thinking when I'm forming up and down exercise it works your legs and bar\", quote_timestamp_start='00:00:11', quote_timestamp_end='00:00:16', clue='Common misconception about the squat being just an up-and-down exercise.'), LocalClue(id='LC3', quote=\"well the squat is actually a position that we're designed to defecate in every\", quote_timestamp_start='00:00:16', quote_timestamp_end='00:00:19', clue=\"Explanation of the squat's original purpose for defecation.\"), LocalClue(id='LC4', quote='kid sits and rests in this position and if we look at Western people I think most people end up sitting up on their toes more in this position', quote_timestamp_start='00:00:22', quote_timestamp_end='00:00:25', clue='Comparison of how children and Western adults sit in the squat position.')], global_clues=[GlobalClue(id='GC1', quote=\"so this position is the squat most people when I talk about the squat thinking when I'm forming up and down exercise it works your legs and bar\", quote_timestamp_start='00:00:10', quote_timestamp_end='00:00:16', clue='Introduction of the squat position and common misconceptions.', relevance_to_segment='This provides context for the segment by explaining what the squat is and addressing common misconceptions.'), GlobalClue(id='GC2', quote=\"well the squat is actually a position that we're designed to defecate in every kid sits and rests in this position\", quote_timestamp_start='00:00:16', quote_timestamp_end='00:00:22', clue=\"Explanation of the squat's original purpose and how children naturally sit in this position.\", relevance_to_segment='This provides a historical and functional context for the squat, which is relevant to understanding its importance.')], logical_inferences=[LogicalInference(id='LI1', description=\"Explanation of Squat's Purpose\", details='The segment explains the original purpose of the squat position for defecation and how it is a natural resting position for children, contrasting it with the common misconception of it being just an exercise.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"not for an exercise necessarily not because of fitness or to be fitter but just because you're designed to do it\", quote_timestamp_start='00:00:38', quote_timestamp_end='00:00:40', clue='Emphasis on the squat being a natural position rather than just an exercise.'), LocalClue(id='LC2', quote=\"this here is just the human resting position so when I run classes people always talk about the squat how they can't quite get their heel down on the floor\", quote_timestamp_start='00:00:43', quote_timestamp_end='00:00:45', clue='Description of the squat as a human resting position and common issue of not being 
able to get heels down.')], global_clues=[GlobalClue(id='GC1', quote=\"this here is just the human resting position so when I run classes people always talk about the squat how they can't quite get their heel down on the floor\", quote_timestamp_start='00:00:43', quote_timestamp_end='00:00:45', clue='Description of the squat as a human resting position and common issue of not being able to get heels down.', relevance_to_segment='This provides context for the segment by explaining the squat as a natural resting position and a common issue people face.')], logical_inferences=[LogicalInference(id='LI1', description='Natural Resting Position', details='The segment emphasizes that the squat is a natural human resting position and not just an exercise, highlighting a common issue people face with getting their heels down.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"since about the age of four years old you've been wearing shoes\", quote_timestamp_start='00:00:53', quote_timestamp_end='00:00:55', clue='Introduction of the idea that wearing shoes from a young age affects squat mobility.')], global_clues=[GlobalClue(id='GC1', quote=\"since about the age of four years old you've been wearing shoes\", quote_timestamp_start='00:00:53', quote_timestamp_end='00:00:55', clue='Introduction of the idea that wearing shoes from a young age affects squat mobility.', relevance_to_segment='This provides context for the segment by explaining how wearing shoes from a young age affects squat mobility.')], logical_inferences=[LogicalInference(id='LI1', description='Impact of Wearing Shoes', details='The segment introduces the idea that wearing shoes from a young age affects squat mobility, setting the stage for further explanation.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"we are an animal we're designed to walk around on bare feet and the reason that we'll lose our squat mobility is simply because we come to become tight in our ankles\", quote_timestamp_start='00:01:02', quote_timestamp_end='00:01:07', clue='Explanation of how wearing shoes leads to tight ankles and loss of squat mobility.')], global_clues=[GlobalClue(id='GC1', quote=\"we are an animal we're designed to walk around on bare feet and the reason that we'll lose our squat mobility is simply because we come to become tight in our ankles\", quote_timestamp_start='00:01:02', quote_timestamp_end='00:01:07', clue='Explanation of how wearing shoes leads to tight ankles and loss of squat mobility.', relevance_to_segment='This provides context for the segment by explaining how wearing shoes leads to tight ankles and loss of squat mobility.')], logical_inferences=[LogicalInference(id='LI1', description='Loss of Squat Mobility', details='The segment explains that wearing shoes leads to tight ankles and loss of squat mobility, emphasizing the importance of walking barefoot.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='if your ankles have become tight then squat is really difficult and hard', quote_timestamp_start='00:01:45', quote_timestamp_end='00:01:47', clue='This phrase explains the difficulty of squatting due to tight ankles, which is likely demonstrated in this segment.'), 
LocalClue(id='LC2', quote='but like every other muscle and joint in the body is trainable', quote_timestamp_start='00:01:47', quote_timestamp_end='00:01:49', clue='This phrase suggests that despite the difficulty, ankle mobility can be improved with training.')], global_clues=[GlobalClue(id='GC1', quote=\"if you put your heels on a yoga block or a book you're in that position rather than that position that will take away your ankle mobility and allow you to get all the way down\", quote_timestamp_start='00:01:12', quote_timestamp_end='00:01:16', clue='This quote explains a method to improve squat depth by using a yoga block or book to compensate for limited ankle mobility.', relevance_to_segment='This provides context for the difficulty mentioned in LC1 and suggests a solution for improving squat form.'), GlobalClue(id='GC2', quote=\"so really a squat is just a test of your ankles if you've got good ankles you're great at squatting\", quote_timestamp_start='00:01:39', quote_timestamp_end='00:01:42', clue='This quote emphasizes the importance of ankle mobility in performing a proper squat.', relevance_to_segment='This reinforces the idea presented in LC1 that tight ankles make squatting difficult.')], logical_inferences=[LogicalInference(id='LI1', description='Ankle Mobility Focus', details='The segment likely demonstrates the impact of tight ankles on squat performance and suggests that improving ankle mobility is key to better squatting.'), LogicalInference(id='LI2', description='Trainability of Ankle Mobility', details='The segment emphasizes that, like other muscles and joints, ankle mobility can be improved with consistent training, making squatting easier over time.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='first one is the squat sitting in a squat not bobbing up and down just being able to function to sit in the squat', quote_timestamp_start='00:02:18', quote_timestamp_end='00:02:22', clue='This phrase introduces the exercise of sitting in a squat position without movement, emphasizing stability and function.'), LocalClue(id='LC2', quote='something again is innate to us as human beings and something that you are able to do as a child', quote_timestamp_start='00:02:22', quote_timestamp_end='00:02:27', clue='This phrase highlights that the ability to squat is a natural human function that we are born with.')], global_clues=[GlobalClue(id='GC1', quote=\"so by doing 10 minutes every day you'll get into the habit\", quote_timestamp_start='00:02:05', quote_timestamp_end='00:02:08', clue='This quote suggests that consistent daily practice can help regain the ability to squat.', relevance_to_segment='This provides context for the exercise introduced in LC1, emphasizing the importance of regular practice.'), GlobalClue(id='GC2', quote='spend 10 minutes every day getting your squat back', quote_timestamp_start='00:02:31', quote_timestamp_end='00:02:34', clue='This quote reinforces the idea of daily practice to improve squatting ability.', relevance_to_segment='This supports the exercise mentioned in LC1 and LC2, suggesting that daily practice is key to regaining natural squatting ability.')], logical_inferences=[LogicalInference(id='LI1', description='Importance of Daily Practice', details='The segment likely emphasizes the importance of spending a few minutes each day practicing sitting in a squat to regain natural squatting ability.'), LogicalInference(id='LI2', 
description='Natural Human Function', details='The segment underscores that squatting is an innate human ability that can be restored through consistent practice.')])),\n", + " SegmentWithClueInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='start with your feet slightly wider than shoulder width apart', quote_timestamp_start='00:00:02', quote_timestamp_end='00:00:04', clue='This instruction sets the initial stance for performing a proper squat.'), LocalClue(id='LC2', quote='cross your arms in front so touch your right hand to your left shoulder and vice versa pointing your elbows straight ahead', quote_timestamp_start='00:00:06', quote_timestamp_end='00:00:13', clue='Describes the arm position to maintain balance and proper form during the squat.'), LocalClue(id='LC3', quote='shift your weight to the ball of your feet and bend your knees', quote_timestamp_start='00:00:15', quote_timestamp_end='00:00:18', clue='Guides the weight distribution and initial movement for the squat.'), LocalClue(id='LC4', quote='get as close to 90 degrees as you can looking straight ahead and from here push back up to the starting position', quote_timestamp_start='00:00:21', quote_timestamp_end='00:00:25', clue='Describes the depth of the squat and the motion to return to the starting position.'), LocalClue(id='LC5', quote=\"this is going to be great for strengthening your thighs or your quadriceps as well as your butt or your glutes and it's also going to be great to work on your posture\", quote_timestamp_start='00:00:29', quote_timestamp_end='00:00:37', clue='Explains the benefits of performing squats correctly, including muscle strengthening and posture improvement.'), LocalClue(id='LC6', quote=\"that's how to properly perform a squat\", quote_timestamp_start='00:00:41', quote_timestamp_end='00:00:44', clue='Concludes the segment by summarizing that the instructions provided are for performing a proper squat.')], global_clues=[GlobalClue(id='GC1', quote=\"let's learn how to properly perform a squat\", quote_timestamp_start='00:00:00', quote_timestamp_end='00:00:01', clue='Introduces the topic of the video, which is learning the proper technique for squatting.', relevance_to_segment='This statement sets the context for the entire segment, indicating that the instructions provided are aimed at teaching the correct squat form.'), GlobalClue(id='GC2', quote=\"this is going to be great for strengthening your thighs or your quadriceps as well as your butt or your glutes and it's also going to be great to work on your posture\", quote_timestamp_start='00:00:29', quote_timestamp_end='00:00:37', clue='Explains the benefits of performing squats correctly, including muscle strengthening and posture improvement.', relevance_to_segment='This reinforces the importance of following the proper technique as described in the segment to achieve these benefits.')], logical_inferences=[LogicalInference(id='LI1', description='Step-by-Step Instruction for Proper Squat', details='The segment provides a detailed, step-by-step guide on how to perform a squat correctly, starting from the initial stance to the final position. 
This includes specific instructions on foot placement, arm positioning, weight distribution, and movement.'), LogicalInference(id='LI2', description='Emphasis on Proper Form and Benefits', details='The segment emphasizes the importance of proper form in performing squats to avoid injury and maximize benefits such as strengthening the quadriceps, glutes, and improving posture.'), LogicalInference(id='LI3', description='Educational Purpose', details='The primary purpose of this segment is educational, aiming to teach viewers the correct technique for squatting to ensure they perform the exercise safely and effectively.')]))],\n", + " 'annotations': ['{\"squats_probability\": \"low\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", + " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", + " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the squat as a natural human resting position, which is a correct understanding.\", \"wrong\": \"The segment mentions a common issue where people can\\'t get their heels down on the floor during a squat.\", \"correction\": \"Focus on ankle mobility exercises and practice squatting with support to gradually improve heel placement.\"}}',\n", + " '{\"squats_probability\": \"unknown\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", + " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", + " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of ankle mobility in squatting and suggests that it can be improved with training.\", \"wrong\": \"The segment likely demonstrates difficulty in squatting due to tight ankles.\", \"correction\": \"To improve squat depth and form, consider placing your heels on a yoga block or a book to compensate for limited ankle mobility.\"}}',\n", + " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"unknown\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of stability and function in the squat position, highlighting that squatting is a natural human ability.\", \"wrong\": null, \"correction\": \"Practice sitting in a squat position daily for 10 minutes to regain natural squatting ability.\"}}',\n", + " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"correct\", \"squats_feedback\": {\"right\": \"The segment provides a detailed, step-by-step guide on how to perform a squat correctly, including specific instructions on foot placement, arm positioning, weight distribution, and movement.\", \"wrong\": null, \"correction\": null}}']}" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1061,6 +967,27 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, 
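The `(start, end)` tuples printed in the `Segments for video ...` lines appear to be seconds, which `detect_segments` then carries as `HH:MM:SS.mmm` strings on `SegmentInfo` (e.g. `(138, 147)` becomes `00:02:18.000`-`00:02:27.000`). A minimal sketch of that conversion, assuming plain seconds in and the notebook's timestamp format out; `get_segments` itself is not shown in this diff:

```python
# Minimal sketch (not the repo's get_segments): turn (start_sec, end_sec)
# tuples into the HH:MM:SS.mmm strings that SegmentInfo carries.
def to_timestamp(seconds: float) -> str:
    whole = int(seconds)
    hours, rem = divmod(whole, 3600)
    minutes, secs = divmod(rem, 60)
    millis = int(round((seconds - whole) * 1000))
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

segments = [(0, 5), (9, 24), (138, 147)]
stamps = [(to_timestamp(a), to_timestamp(b)) for a, b in segments]
# [('00:00:00.000', '00:00:05.000'), ('00:00:09.000', '00:00:24.000'),
#  ('00:02:18.000', '00:02:27.000')] -- matches the SegmentInfo output above
```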
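The `annotations` entries in the final state are JSON strings with a stable shape: `squats_probability`, `squats_technique_correctness`, and a `squats_feedback` object with `right`/`wrong`/`correction`, any of which may be null. A minimal validation sketch, assuming pydantic v1 as imported elsewhere in the notebook; the model names here are illustrative, not taken from the repo:

```python
# Illustrative models only -- the notebook's actual output schema class
# is not shown in this diff. Assumes pydantic v1 via langchain.pydantic_v1.
from typing import Optional
from langchain.pydantic_v1 import BaseModel

class SquatsFeedback(BaseModel):
    right: Optional[str] = None       # what the segment does correctly
    wrong: Optional[str] = None       # what it does incorrectly
    correction: Optional[str] = None  # how to fix the mistake

class SquatsAnnotation(BaseModel):
    squats_probability: str                             # "low" / "high" / "unknown"
    squats_technique_correctness: Optional[str] = None  # "correct" / "incorrect" / "unknown" / null
    squats_feedback: SquatsFeedback

# e.g. parsed = [SquatsAnnotation.parse_raw(a) for a in state["annotations"]]
```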
{ "cell_type": "code", "execution_count": null, From 8f10bb2bba043d6d940d5516eb038b1c1141b186 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Wed, 28 Aug 2024 05:06:06 +0000 Subject: [PATCH 6/9] Refactor agent to give it a file structure --- agent.ipynb | 1020 --------------------------------- agent/agent.py | 113 ++++ agent/agent_parts.ipynb | 946 ++++++++++++++++++++++++++++++ agent/tools/annotating.py | 181 ++++++ agent/tools/prompts.py | 217 +++++++ agent/tools/scraping.py | 150 +++++ agent/tools/video_chunking.py | 162 ++++++ 7 files changed, 1769 insertions(+), 1020 deletions(-) delete mode 100644 agent.ipynb create mode 100644 agent/agent.py create mode 100644 agent/agent_parts.ipynb create mode 100644 agent/tools/annotating.py create mode 100644 agent/tools/prompts.py create mode 100644 agent/tools/scraping.py create mode 100644 agent/tools/video_chunking.py diff --git a/agent.ipynb b/agent.ipynb deleted file mode 100644 index 979ea0d..0000000 --- a/agent.ipynb +++ /dev/null @@ -1,1020 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "_ = load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import TypedDict, Annotated, Sequence, List, Optional\n", - "import operator\n", - "\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "from langchain.pydantic_v1 import BaseModel, Field\n", - "from langchain_core.prompts import ChatPromptTemplate" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import AzureChatOpenAI\n", - "\n", - "llm = AzureChatOpenAI(\n", - " temperature=0.0,\n", - " azure_deployment=\"gpt4o\",\n", - " openai_api_version=\"2023-07-01-preview\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "class VideoInfo(BaseModel):\n", - " video_id: str\n", - " url: str\n", - " relative_video_path: str\n", - " subs: str\n", - " transcript: str\n", - "\n", - "\n", - "class SegmentInfo(BaseModel):\n", - " start_timestamp: str\n", - " end_timestamp: str\n", - " fps: float\n", - " video_id: str\n", - "\n", - "\n", - "class LocalClue(BaseModel):\n", - " \"\"\"Local clues for a segment\"\"\"\n", - "\n", - " id: str = Field(description=\"LC1,LC2...\")\n", - " quote: str = Field(\n", - " description=\"the quote from the transcript that was used to create this clue.\"\n", - " )\n", - " quote_timestamp_start: str = Field(\n", - " description=\"the exact start timestamp of the quote.\"\n", - " )\n", - " quote_timestamp_end: str = Field(\n", - " description=\"the exact end timestamp of the quote.\"\n", - " )\n", - " clue: str = Field(description=\"the main clue data\")\n", - "\n", - "\n", - "class GlobalClue(BaseModel):\n", - " \"\"\"Global clues for a segment\"\"\"\n", - "\n", - " id: str = Field(description=\"GC1,GC2...\")\n", - " quote: str = Field(\n", - " description=\"the quote from the transcript that was used to create this clue.\"\n", - " )\n", - " quote_timestamp_start: str = Field(\n", - " description=\"the exact start timestamp of the quote.\"\n", - " )\n", - " quote_timestamp_end: str = Field(\n", - " description=\"the exact end timestamp of the quote.\"\n", - " )\n", - " clue: str = Field(description=\"the main clue data.\")\n", - " relevance_to_segment: str = Field(\n", - " description=\"why 
do you think this global clue is relevant to the segment you are working with right now.\"\n", - " )\n", - "\n", - "\n", - "class LogicalInference(BaseModel):\n", - " \"\"\"Logical inferences for a segment\"\"\"\n", - "\n", - " id: str = Field(description=\"LI1,LI2,...\")\n", - " description: str = Field(description=\"A concise form of the logical inference.\")\n", - " details: str = Field(\n", - " description=\"A verbose explanation of what insight about what happens in this segment should be made based on the clues that you found.\"\n", - " )\n", - "\n", - "\n", - "class SegmentAnnotation(BaseModel):\n", - " local_clues: list[LocalClue] = Field(\n", - " description=\"Local clues are inside the segment in terms of timestamps.\"\n", - " )\n", - " global_clues: list[GlobalClue] = Field(\n", - " description=\"Global clues are scattered across the entire transcript.\"\n", - " )\n", - " logical_inferences: list[LogicalInference] = Field(\n", - " description=\"What can we infer about the topic, that the user is looking for in the video, can we make based on the clues inside this segment\"\n", - " )\n", - "\n", - "\n", - "class SegmentWithClueInfo(BaseModel):\n", - " \"\"\"\n", - " Annotation for a video segment.\n", - " \"\"\"\n", - "\n", - " start_timestamp: str = Field(\n", - " description=\"start timestamp of the segment in format HH:MM:SS.MS\"\n", - " )\n", - " end_timestamp: str = Field(\n", - " description=\"start timestamp of the segment in format HH:MM:SS.MS\"\n", - " )\n", - " segment_annotation: SegmentAnnotation = Field(\n", - " description=\"list of annotations for the segment\"\n", - " )\n", - "\n", - "\n", - "class VideoAnnotation(BaseModel):\n", - " \"\"\"\n", - " Segments of a video.\n", - " \"\"\"\n", - "\n", - " segments: list[SegmentWithClueInfo] = Field(\n", - " description=\"information about each segment\"\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# 2. Create the state\n", - "\n", - "\n", - "class AgentState(TypedDict):\n", - " task: str\n", - " search_queries: List[str]\n", - " video_ids: List[str]\n", - " video_infos: List[VideoInfo]\n", - " clip_text_prompts: List[str]\n", - " segment_infos: List[SegmentInfo]\n", - " clues: List[str]\n", - " annotations: List[str]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# 3. Set prompts\n", - "\n", - "GEN_QUERIES_PROMPT = (\n", - " \"You a helping the user to find a very large and diverse set of videos on a video hosting service.\",\n", - " \"A user will only describe which videos they are looking for and how many queries they need.\",\n", - ")\n", - "\n", - "# prompt='I want to find instructional videos about how to do squats.',\n", - "# num_queries_prompt = f'I need {num_queries} queries'\n", - "\n", - "EXTRACT_CLUES_PROMPT = \"\"\"You are a highly intelligent data investigator. \n", - "You take unstructured damaged data and look for clues that could help restore the initial information\n", - "and extract important insights from it.\n", - "You are the best one for this job in the world because you are a former detective. \n", - "You care about even the smallest details, and your guesses about what happened in the initial file\n", - "even at very limited inputs are usually absolutely right. 
\n", - "You use deductive and inductive reasoning at the highest possible quality.\n", - "\n", - "#YOUR TODAY'S JOB\n", - "The user needs to learn about what happens in a specific segment of a video file. Your job is to help the user by providing clues that would help the user make the right assumption.\n", - "The user will provide you with: \n", - "1. Instructions about what kind of information the user is trying to obtain.\n", - "2. A list of time codes of the segments in format \"-\". All the provided segment of the video contain what the user is looking for, but other parts of the video might have different content.\n", - "3. A transcript of the *full video* in format of \"\\\\n\"\n", - "\n", - "Your task:\n", - "1. Read the transcript.\n", - "2. Provide the clues in a given format.\n", - "3. Provied any other info requested by the user.\n", - "\n", - "#RULES\n", - "!!! VERY IMPORTANT !!!\n", - "1. Rely only on the data provided in the transcript. Do not improvise. All the quotes and corresponding timestamps must be taken from the transcript. Quote timestamps must be taken directly from the transcript.\n", - "2. Your job is to find the data already provided in the transcript.\n", - "3. Analyze every segment. Only skip a segment if there is no information about it in the trascript.\n", - "4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.\n", - "5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.\n", - "6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.\n", - "7. Follow the format output.\n", - "8. Be very careful with details. Don't generalize. Always double check your results.\n", - "\n", - "Please, help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.\n", - "\n", - "WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data, \n", - "is a fragment of information extracted from a corrupted or incomplete source that provides \n", - "insight into the original content. These fragments serve as starting points for inference \n", - "and deduction, allowing researchers to hypothesize about the fuller context or meaning of \n", - "the degraded material. The process of identifying and interpreting clues involves both objective analysis of the \n", - "available data and subjective extrapolation based on domain knowledge, contextual understanding, \n", - "and logical reasoning.\n", - "\n", - "Here is what the user expects to have from you:\n", - "1. *Local clues* that would help the user undestand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside a specific segment.\n", - "2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific that they are provided for.\n", - "3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. 
Logical inferences for a segment are deducted from local and global clues for this segment.\n", - "\n", - "!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!\n", - "\n", - " Good local clues examples: [\n", - " {\n", - " \"id\": \"LC1\",\n", - " \"timestamp\": \"00:00:19\",\n", - " \"quote\": \"exercises do them wrong and instead of\",\n", - " \"clue\": \"This phrase introduces the concept of incorrect exercise form, setting the stage for a demonstration of improper technique.\"\n", - " },\n", - " {\n", - " \"id\": \"LC2\",\n", - " \"timestamp\": \"00:00:21\",\n", - " \"quote\": \"growing nice quads and glutes you'll\",\n", - " \"clue\": \"Mentions the expected benefits of proper squats (muscle growth), implying that these benefits won't be achieved with incorrect form.\"\n", - " },\n", - " {\n", - " \"id\": \"LC3\",\n", - " \"timestamp\": \"00:00:22\",\n", - " \"quote\": \"feel aches and pains in your knees your\",\n", - " \"clue\": \"Directly states negative consequences of improper form, strongly suggesting that this segment demonstrates incorrect technique.\"\n", - " },\n", - " {\n", - " \"id\": \"LC4\",\n", - " \"timestamp\": \"00:00:24\",\n", - " \"quote\": \"lower back and even your shoulders\",\n", - " \"clue\": \"Continuation of LC3, emphasizing multiple areas of potential pain from improper form.\"\n", - " },\n", - " {\n", - " \"id\": \"LC5\",\n", - " \"timestamp\": \"00:00:26\",\n", - " \"quote\": \"let's see how to do it correctly\",\n", - " \"clue\": \"This phrase suggests a transition is about to occur. The incorrect form has been shown, and correct form will follow.\"\n", - " }\n", - " ]\n", - "\n", - " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", - " For example, if the transcript says:\n", - " \"00:05:02\n", - " he took the glasses\n", - " 00:05:04\n", - " and gave them to me\"\n", - " Then a GOOD output will be:\n", - " - timestamp: 00:05:03\n", - " - quote: \"he took the glasses and gave them to me\"\n", - " And a BAD output would be:\n", - " - timestamp: 00:04:02\n", - " - quote: \"he gave me the glasses\"\n", - "\n", - " Good global clues examples: [\n", - " {\n", - " \"id\": \"GC1\",\n", - " \"timestamp\": \"00:01:15\",\n", - " \"quote\": \"Before we dive into specific techniques, let's talk about safety.\",\n", - " \"clue\": \"Introduces the theme of safety in squatting.\",\n", - " \"relevance_to_segment\": \"This earlier emphasis on safety provides context for why proper depth is important and why it's being addressed in our segment. It connects to the fear of knee pain mentioned in LC3.\"\n", - " },\n", - " {\n", - " \"id\": \"GC2\",\n", - " \"timestamp\": \"00:02:30\",\n", - " \"quote\": \"Squatting is a fundamental movement pattern in everyday life.\",\n", - " \"clue\": \"Emphasizes the importance of squats beyond just exercise.\",\n", - " \"relevance_to_segment\": \"This broader context heightens the importance of learning proper squat depth as demonstrated in our segment. It suggests that the techniques shown have applications beyond just gym workouts.\"\n", - " },\n", - " {\n", - " \"clue_id\": \"GC3\",\n", - " \"timestamp\": \"00:05:20\",\n", - " \"quote\": \"If you have existing knee issues, consult a physician before attempting deep squats.\",\n", - " \"clue\": \"Provides a health disclaimer related to squat depth.\",\n", - " \"relevance_to_segment\": \"While this comes after our segment, it's relevant because it addresses the concern about knee pain mentioned in LC3. 
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "import decord\n",
- "import math\n",
- "import numpy as np\n",
- "\n",
- "# decord.bridge.set_bridge(\"torch\")\n",
- "\n",
- "\n",
- "class VideoInferenceDataset(torch.utils.data.IterableDataset):\n",
- " def __init__(self, video_infos: List[VideoInfo], local_root: Path):\n",
- " super().__init__()\n",
- "\n",
- " self.video_infos = video_infos\n",
- " self.local_root = local_root\n",
- " self.frame_generator = self.get_frame_generator(video_infos, local_root)\n",
- "\n",
- " @staticmethod\n",
- " def get_frame_generator(video_infos, local_root: Path):\n",
- "\n",
- " for video_idx, video_info in enumerate(video_infos):\n",
- " video_path = local_root.joinpath(video_info.relative_video_path)\n",
- " vr = decord.VideoReader(str(video_path))\n",
- " num_frames = len(vr)\n",
- " fps = vr.get_avg_fps()\n",
- " frame_indices = range(0, num_frames, round(fps))\n",
- "\n",
- " for frame_idx in frame_indices:\n",
- " # print(f\"Frame idx {frame_idx}\")\n",
- " frame = vr[frame_idx].asnumpy()\n",
- " yield {\n",
- " \"frame\": frame,\n",
- " \"frame_idx\": frame_idx,\n",
- " \"video_id\": video_idx,\n",
- " }\n",
- "\n",
- " def __next__(self):\n",
- " return next(self.frame_generator)\n",
- "\n",
- " def __iter__(self):\n",
- " return self"
- ]
- },
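The dataset above samples one frame per second of video (`range(0, num_frames, round(fps))`) and streams every video through a single generator built in `__init__`. One consequence: with more than one `DataLoader` worker, each worker would replay the same generator and duplicate every frame, which is presumably why `detect_segments_node` below uses `num_workers=1` and leaves `worker_init_fn` commented out. A minimal consumption sketch, assuming `video_infos` and `LOCAL_ROOT` from the download step:

```python
# Sketch: iterate the 1-fps frame stream in batches of 12, as the
# detect_segments node does. batch["frame"] is a (B, H, W, C) uint8 tensor;
# batch["video_id"] is the *index* into video_infos, not the YouTube id.
dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)
loader = torch.utils.data.DataLoader(dataset, batch_size=12, num_workers=1)
for batch in loader:
    print(batch["video_id"], batch["frame_idx"], batch["frame"].shape)
    break
```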
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "import math\n",
- "\n",
- "# 4. Create nodes\n",
- "\n",
- "\n",
- "def gen_queries_node(state: AgentState):\n",
- " class QueryList(BaseModel):\n",
- " \"\"\"A list of queries to find videos on a video hosting service\"\"\"\n",
- "\n",
- " search_queries: list[str] = Field(default=None, description=\"a list of queries\")\n",
- "\n",
- " messages = [\n",
- " SystemMessage(content=str(GEN_QUERIES_PROMPT)),\n",
- " HumanMessage(content=state[\"task\"]),\n",
- " ]\n",
- "\n",
- " model = llm.with_structured_output(QueryList)\n",
- " response: QueryList = model.invoke(messages)\n",
- "\n",
- " return {\"search_queries\": response.search_queries[:2]}\n",
- "\n",
- "\n",
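Note that `gen_queries_node` reads `state["task"]`, a key the `AgentState` TypedDict at the top of the notebook does not declare; the recorded run below nonetheless shows `task` flowing through the state, so the schema presumably catches up in a later patch. Since the node is a plain function over a dict, it can be smoke-tested outside the graph:

```python
# Sketch: exercising the node in isolation. Requires the llm client from the
# first cell; the task string is the one used in the recorded run below.
out = gen_queries_node({"task": "i wanna teach people how to do squats"})
print(out["search_queries"])
# The recorded run produced: ['how to do squats', 'squat exercise tutorial']
```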
video_infos\n", - " self.local_root = local_root\n", - " self.frame_generator = self.get_frame_generator(video_infos, local_root)\n", - "\n", - " @staticmethod\n", - " def get_frame_generator(video_infos, local_root: Path):\n", - "\n", - " for video_idx, video_info in enumerate(video_infos):\n", - " video_path = local_root.joinpath(video_info.relative_video_path)\n", - " vr = decord.VideoReader(str(video_path))\n", - " num_frames = len(vr)\n", - " fps = vr.get_avg_fps()\n", - " frame_indices = range(0, num_frames, round(fps))\n", - "\n", - " for frame_idx in frame_indices:\n", - " # print(f\"Frame idx {frame_idx}\")\n", - " frame = vr[frame_idx].asnumpy()\n", - " yield {\n", - " \"frame\": frame,\n", - " \"frame_idx\": frame_idx,\n", - " \"video_id\": video_idx,\n", - " }\n", - "\n", - " def __next__(self):\n", - " return next(self.frame_generator)\n", - "\n", - " def __iter__(self):\n", - " return self" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import math\n", - "\n", - "# 4. Create nodes\n", - "\n", - "\n", - "def gen_queries_node(state: AgentState):\n", - " class QueryList(BaseModel):\n", - " \"\"\"A list of queries to find videos on a video hosting service\"\"\"\n", - "\n", - " search_queries: list[str] = Field(default=None, description=\"a list of queries\")\n", - "\n", - " messages = [\n", - " SystemMessage(content=str(GEN_QUERIES_PROMPT)),\n", - " HumanMessage(content=state[\"task\"]),\n", - " ]\n", - "\n", - " model = llm.with_structured_output(QueryList)\n", - " response: QueryList = model.invoke(messages)\n", - "\n", - " return {\"search_queries\": response.search_queries[:2]}\n", - "\n", - "\n", - "def get_video_ids_node(state: AgentState):\n", - "\n", - " queries = state[\"search_queries\"]\n", - " videos_per_query = 1\n", - " sleep = 0\n", - " sort_by = \"relevance\"\n", - " results_type = \"video\"\n", - " only_creative_commons = False\n", - "\n", - " video_ids = set()\n", - " for query in queries:\n", - " for video in scrapetube.get_search(\n", - " query=query,\n", - " limit=videos_per_query,\n", - " sleep=sleep,\n", - " sort_by=sort_by,\n", - " results_type=results_type,\n", - " ):\n", - " video_ids.add(video[\"videoId\"])\n", - " video_ids = list(video_ids)\n", - "\n", - " if only_creative_commons:\n", - " video_ids_cc = []\n", - " for i in video_ids:\n", - " YDL_OPTIONS = {\n", - " \"quiet\": True,\n", - " \"simulate\": True,\n", - " \"forceurl\": True,\n", - " }\n", - " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", - " info = ydl.extract_info(f\"youtube.com/watch?v={i}\", download=False)\n", - " if \"creative commons\" in info.get(\"license\", \"\").lower():\n", - " video_ids_cc.append(i)\n", - " video_ids = video_ids_cc\n", - "\n", - " return {\"video_ids\": video_ids}\n", - "\n", - "\n", - "def download_node(state: AgentState):\n", - "\n", - " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", - " video_dir = LOCAL_ROOT / \"videos\"\n", - " sub_dir = LOCAL_ROOT / \"subs\"\n", - "\n", - " discard_path = LOCAL_ROOT / \"videos_without_subs\"\n", - " discard_path.mkdir(parents=True, exist_ok=True)\n", - "\n", - " video_ids = state[\"video_ids\"]\n", - "\n", - " downloaded_video_ids = [video_path.stem for video_path in video_dir.glob(\"*.mp4\")]\n", - " downloaded_video_ids += [\n", - " video_path.stem for video_path in discard_path.glob(\"*.mp4\")\n", - " ]\n", - "\n", - " print(f\"Downloaded video ids: {downloaded_video_ids}\")\n", - "\n", - " only_with_transcripts = True\n", - 
"\n", - " YDL_OPTIONS = {\n", - " \"writeautomaticsub\": True,\n", - " \"subtitleslangs\": [\"en\"],\n", - " \"subtitlesformat\": \"vtt\",\n", - " \"overwrites\": False,\n", - " \"format\": \"mp4\",\n", - " \"outtmpl\": {\n", - " \"default\": video_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", - " \"subtitle\": sub_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", - " },\n", - " }\n", - "\n", - " video_infos = []\n", - "\n", - " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", - " for video_id in video_ids:\n", - " url = f\"https://www.youtube.com/watch?v={video_id}\"\n", - "\n", - " if video_id not in downloaded_video_ids:\n", - " try:\n", - " ydl.download(url)\n", - " except Exception as e:\n", - " print(datetime.now(), f\"Error at video {video_id}, skipping\")\n", - " print(datetime.now(), e)\n", - " continue\n", - "\n", - " video_path = Path(ydl.prepare_filename({\"id\": video_id, \"ext\": \"mp4\"}))\n", - " sub_path = Path(\n", - " ydl.prepare_filename(\n", - " {\"id\": video_id, \"ext\": \"en.vtt\"}, dir_type=\"subtitle\"\n", - " )\n", - " )\n", - "\n", - " with sub_path.open(\"r\") as f:\n", - " subs = f.read()\n", - "\n", - " transcript = vtt_to_txt(sub_path)\n", - "\n", - " video_info = VideoInfo(\n", - " video_id=video_id,\n", - " url=url,\n", - " relative_video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),\n", - " subs=subs,\n", - " transcript=transcript,\n", - " )\n", - "\n", - " video_infos.append(video_info)\n", - "\n", - " if only_with_transcripts:\n", - " filtered_video_infos = []\n", - " for video_info in video_infos:\n", - " if video_info.transcript:\n", - " filtered_video_infos.append(video_info)\n", - " else:\n", - " video_path = LOCAL_ROOT / video_info.video_path\n", - " video_path.rename(discard_path / video_path.name)\n", - " video_infos = filtered_video_infos\n", - "\n", - " return {\"video_infos\": video_infos}\n", - "\n", - "\n", - "def detect_segments_node(state: AgentState):\n", - "\n", - " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", - "\n", - " clip_text_prompts = state[\"clip_text_prompts\"]\n", - " video_infos = state[\"video_infos\"]\n", - "\n", - " CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n", - "\n", - " model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(\"cuda\")\n", - " processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n", - "\n", - " dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n", - "\n", - " dataloader = torch.utils.data.DataLoader(\n", - " dataset,\n", - " num_workers=1,\n", - " batch_size=12,\n", - " pin_memory=True,\n", - " # worker_init_fn=worker_init_fn,\n", - " )\n", - " dataloader = iter(dataloader)\n", - "\n", - " smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n", - "\n", - " clip_results_dict = defaultdict(list)\n", - "\n", - " print(\"Init model complete\")\n", - "\n", - " batch_counter = 0\n", - " MAX_BATCHES = 50\n", - "\n", - " while batch_counter < MAX_BATCHES:\n", - " batch_counter += 1\n", - " try:\n", - " start_time = time.time()\n", - " batch = next(dataloader)\n", - " # print(f\"Fetch time: {time.time() - start_time:.2f} seconds\")\n", - " except StopIteration:\n", - " break\n", - "\n", - " inputs = processor(\n", - " images=batch[\"frame\"],\n", - " text=clip_text_prompts,\n", - " return_tensors=\"pt\",\n", - " padding=True,\n", - " truncation=True,\n", - " )\n", - " inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n", - "\n", - " outputs = model(**inputs)\n", - "\n", - " logits = outputs.logits_per_image\n", - " probs = 
- "def detect_segments_node(state: AgentState):\n",
- "\n",
- " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n",
- "\n",
- " clip_text_prompts = state[\"clip_text_prompts\"]\n",
- " video_infos = state[\"video_infos\"]\n",
- "\n",
- " CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n",
- "\n",
- " model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(\"cuda\")\n",
- " processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n",
- "\n",
- " dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n",
- "\n",
- " dataloader = torch.utils.data.DataLoader(\n",
- " dataset,\n",
- " num_workers=1,\n",
- " batch_size=12,\n",
- " pin_memory=True,\n",
- " # worker_init_fn=worker_init_fn,\n",
- " )\n",
- " dataloader = iter(dataloader)\n",
- "\n",
- " smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n",
- "\n",
- " clip_results_dict = defaultdict(list)\n",
- "\n",
- " print(\"Init model complete\")\n",
- "\n",
- " batch_counter = 0\n",
- " MAX_BATCHES = 50\n",
- "\n",
- " while batch_counter < MAX_BATCHES:\n",
- " batch_counter += 1\n",
- " try:\n",
- " start_time = time.time()\n",
- " batch = next(dataloader)\n",
- " # print(f\"Fetch time: {time.time() - start_time:.2f} seconds\")\n",
- " except StopIteration:\n",
- " break\n",
- "\n",
- " inputs = processor(\n",
- " images=batch[\"frame\"],\n",
- " text=clip_text_prompts,\n",
- " return_tensors=\"pt\",\n",
- " padding=True,\n",
- " truncation=True,\n",
- " )\n",
- " inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n",
- "\n",
- " outputs = model(**inputs)\n",
- "\n",
- " logits = outputs.logits_per_image\n",
- " probs = torch.nn.functional.sigmoid(logits).detach().cpu().numpy()\n",
- "\n",
- " for video_idx, frame_idx, prob in zip(\n",
- " batch[\"video_id\"], batch[\"frame_idx\"], probs\n",
- " ):\n",
- " # print(type(video_id.item()), type(frame_idx.item()), type(prob.item()))\n",
- " video_id = video_infos[video_idx.item()].video_id\n",
- "\n",
- " clip_results_dict[\"video_id\"].append(video_id)\n",
- " clip_results_dict[\"frame_idx\"].append(frame_idx.item())\n",
- " clip_results_dict[\"probs\"].append(prob.item())\n",
- "\n",
- " print(\"All frames processed\")\n",
- " clip_results = pd.DataFrame(clip_results_dict)\n",
- " print(\"Dataframe created\")\n",
- " print(clip_results)\n",
- "\n",
- " max_gap_seconds = 1\n",
- " fps_sampling = 1\n",
- " min_prob = 0.1\n",
- " min_segment_seconds = 3\n",
- " fps = 25\n",
- "\n",
- " segment_infos = []\n",
- " for video_id, video_clip_results in clip_results.groupby(\"video_id\"):\n",
- " probs = video_clip_results[\"probs\"].values\n",
- " probs = smoother.smooth(probs).smooth_data[0]\n",
- " segments_start_end = get_segments(\n",
- " probs,\n",
- " max_gap=round(max_gap_seconds * fps_sampling),\n",
- " min_prob=min_prob,\n",
- " min_segment=round(min_segment_seconds * fps_sampling),\n",
- " )\n",
- "\n",
- " print(f\"Segments for video {video_id}: {segments_start_end}\")\n",
- "\n",
- " sec2ts = lambda s: time.strftime(\n",
- " f\"%H:%M:%S.{round((s%1)*1000):03d}\", time.gmtime(s)\n",
- " )\n",
- "\n",
- " for start, end in segments_start_end:\n",
- " segment_infos.append(\n",
- " SegmentInfo(\n",
- " start_timestamp=sec2ts(start),\n",
- " end_timestamp=sec2ts(end),\n",
- " fps=fps,\n",
- " video_id=video_id,\n",
- " )\n",
- " )\n",
- "\n",
- " return {\"segment_infos\": segment_infos}\n",
- "\n",
- "\n",
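`get_segments` is imported from `datagen.detect_segments`, also outside this patch. From the call site (a smoothed per-second probability array in, inclusive `(start, end)` second pairs out, as in the printed `Segments for video ...` lines), a plausible reconstruction:

```python
# Rough reconstruction of get_segments, inferred from its call site: threshold
# the per-second probabilities, merge hot runs separated by short gaps, and
# drop runs shorter than min_segment. Returns inclusive (start, end) indices.
import numpy as np


def get_segments_sketch(probs, max_gap: int, min_prob: float, min_segment: int):
    hot = np.asarray(probs) >= min_prob
    segments, start, last = [], None, None
    for i, h in enumerate(hot):
        if not h:
            continue
        if start is None:
            start = i
        elif i - last > max_gap:  # gap too long: close the previous run
            if last - start + 1 >= min_segment:
                segments.append((start, last))
            start = i
        last = i
    if start is not None and last - start + 1 >= min_segment:
        segments.append((start, last))
    return segments
```

Because the dataset samples at 1 fps, the returned indices are seconds, which `sec2ts` above then renders as `HH:MM:SS.mmm` timestamps.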
- "def extract_clues_node(state: AgentState):\n",
- "\n",
- " prompt_template = ChatPromptTemplate.from_messages(\n",
- " [\n",
- " (\"system\", EXTRACT_CLUES_PROMPT),\n",
- " (\n",
- " \"user\",\n",
- " \"Segment timecodes: {{ segment_timecodes }}\\nTranscript: {{ transcript }}\",\n",
- " ),\n",
- " ],\n",
- " template_format=\"jinja2\",\n",
- " )\n",
- "\n",
- " model = prompt_template | llm.with_structured_output(VideoAnnotation)\n",
- "\n",
- " segment_infos_dict = defaultdict(list)\n",
- " for segment_info in state[\"segment_infos\"]:\n",
- " segment_infos_dict[segment_info.video_id].append(segment_info)\n",
- "\n",
- " video_infos_dict = {\n",
- " video_info.video_id: video_info for video_info in state[\"video_infos\"]\n",
- " }\n",
- "\n",
- " clues = []\n",
- "\n",
- " for video_id, segment_infos in segment_infos_dict.items():\n",
- " transcript = video_infos_dict[video_id].transcript\n",
- " segment_infos_chunks = [\n",
- " segment_infos[i : i + 5] for i in range(0, len(segment_infos), 5)\n",
- " ]\n",
- "\n",
- " for chunk in segment_infos_chunks:\n",
- " video_annotation: VideoAnnotation = model.invoke(\n",
- " {\n",
- " \"segment_timecodes\": \"\\n\".join(\n",
- " [f\"{s.start_timestamp}-{s.end_timestamp}\" for s in chunk]\n",
- " ),\n",
- " \"transcript\": transcript,\n",
- " }\n",
- " )\n",
- " clues.extend(video_annotation.segments)\n",
- "\n",
- " return {\"clues\": clues}\n",
- "\n",
- "\n",
- "def gen_annotations_node(state: AgentState):\n",
- " class SegmentFeedback(BaseModel):\n",
- " right: Optional[str] = Field(description=\"what was right in the performance\")\n",
- " wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n",
- " correction: Optional[str] = Field(\n",
- " description=\"how and in what ways the performance could be improved\"\n",
- " )\n",
- "\n",
- " # The segment timestamps are taken from the provided information.\n",
- " class SegmentCompleteAnnotation(BaseModel):\n",
- " squats_probability: Optional[str] = Field(\n",
- " description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n",
- " )\n",
- " squats_technique_correctness: Optional[str] = Field(\n",
- " description=\"correctness of the squat technique.\"\n",
- " )\n",
- " squats_feedback: Optional[SegmentFeedback] = Field(\n",
- " description=\"what was right and wrong in the squat performance in the segment. When the technique is incorrect, provide instructions on how to correct it.\"\n",
- " )\n",
- "\n",
- " prompt_template = ChatPromptTemplate.from_messages(\n",
- " [\n",
- " (\"system\", GEN_ANNOTATIONS_PROMPT),\n",
- " (\"user\", \"Clues: {{ clues }}\"),\n",
- " ],\n",
- " template_format=\"jinja2\",\n",
- " )\n",
- "\n",
- " model = prompt_template | llm.with_structured_output(SegmentCompleteAnnotation)\n",
- "\n",
- " clues = state[\"clues\"]\n",
- "\n",
- " annotations = []\n",
- " for clue in clues:\n",
- " segment_annotation: SegmentCompleteAnnotation = model.invoke(\n",
- " {\"clues\": clue.json()}\n",
- " )\n",
- "\n",
- " annotations.append(segment_annotation.json())\n",
- "\n",
- " print(annotations)\n",
- "\n",
- " return {\"annotations\": annotations}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "from langgraph.graph import StateGraph, END\n",
- "from typing import TypedDict, Annotated, List\n",
- "import operator\n",
- "from langgraph.checkpoint.memory import MemorySaver\n",
- "\n",
- "from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage\n",
- "\n",
- "memory = MemorySaver()\n",
- "# memory = SqliteSaver.from_conn_string(\":memory:\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "builder = StateGraph(AgentState)\n",
- "\n",
- "builder.add_node(\"generate_queries\", gen_queries_node)\n",
- "builder.add_node(\"get_video_ids\", get_video_ids_node)\n",
- "builder.add_node(\"download\", download_node)\n",
- "builder.add_node(\"detect_segments\", detect_segments_node)\n",
- "builder.add_node(\"extract_clues\", extract_clues_node)\n",
- "builder.add_node(\"gen_annotations\", gen_annotations_node)\n",
- "\n",
- "builder.set_entry_point(\"generate_queries\")\n",
- "\n",
- "# builder.add_conditional_edges(\n",
- "# \"generate\",\n",
- "# should_continue,\n",
- "# {END: END, \"reflect\": \"reflect\"}\n",
- "# )\n",
- "\n",
- "builder.add_edge(\"generate_queries\", \"get_video_ids\")\n",
- "builder.add_edge(\"get_video_ids\", \"download\")\n",
- "builder.add_edge(\"download\", \"detect_segments\")\n",
- "builder.add_edge(\"detect_segments\", \"extract_clues\")\n",
- "builder.add_edge(\"extract_clues\", \"gen_annotations\")\n",
- "builder.add_edge(\"gen_annotations\", END)\n",
- "\n",
- "graph = builder.compile(checkpointer=memory)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'generate_queries': {'search_queries': ['how to do squats', 'squat exercise tutorial']}}\n",
- "{'get_video_ids': {'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E']}}\n",
- "Downloaded video ids: ['IB_icWRzi4E', 'xqvCmoLULNY']\n",
- "dowload happened\n",
- "Init model complete\n",
- "All 
frames processed\n", - "Dataframe created\n", - " video_id frame_idx probs\n", - "0 xqvCmoLULNY 0 2.199925e-08\n", - "1 xqvCmoLULNY 24 1.503990e-01\n", - "2 xqvCmoLULNY 48 1.242190e-01\n", - "3 xqvCmoLULNY 72 1.302760e-01\n", - "4 xqvCmoLULNY 96 1.310861e-01\n", - ".. ... ... ...\n", - "220 IB_icWRzi4E 4275 2.498681e-07\n", - "221 IB_icWRzi4E 4300 3.288528e-07\n", - "222 IB_icWRzi4E 4325 3.445720e-07\n", - "223 IB_icWRzi4E 4350 3.333991e-07\n", - "224 IB_icWRzi4E 4375 2.660451e-07\n", - "\n", - "[225 rows x 3 columns]\n", - "Segments for video IB_icWRzi4E: [(0, 5), (9, 24), (29, 45), (49, 53), (62, 66), (103, 109), (138, 147)]\n", - "Segments for video xqvCmoLULNY: [(1, 44)]\n", - "{'detect_segments': {'segment_infos': [SegmentInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'), SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')]}}\n", - "extract_clues happened\n", - "['{\"squats_probability\": \"low\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the squat as a natural human resting position, which is a correct understanding.\", \"wrong\": \"The segment mentions a common issue where people can\\'t get their heels down on the floor during a squat.\", \"correction\": \"Focus on ankle mobility exercises and practice squatting with support to gradually improve heel placement.\"}}', '{\"squats_probability\": \"unknown\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of ankle mobility in squatting and suggests that it can be improved with training.\", \"wrong\": \"The segment likely demonstrates difficulty in squatting due to tight ankles.\", \"correction\": \"To improve squat depth and form, consider placing your heels on a yoga block or a book to compensate for limited ankle mobility.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"unknown\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of stability and function in the squat position, highlighting that squatting is a natural human ability.\", \"wrong\": null, \"correction\": \"Practice 
sitting in a squat position daily for 10 minutes to regain natural squatting ability.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"correct\", \"squats_feedback\": {\"right\": \"The segment provides a detailed, step-by-step guide on how to perform a squat correctly, including specific instructions on foot placement, arm positioning, weight distribution, and movement.\", \"wrong\": null, \"correction\": null}}']\n", - "{'gen_annotations': {'annotations': ['{\"squats_probability\": \"low\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the squat as a natural human resting position, which is a correct understanding.\", \"wrong\": \"The segment mentions a common issue where people can\\'t get their heels down on the floor during a squat.\", \"correction\": \"Focus on ankle mobility exercises and practice squatting with support to gradually improve heel placement.\"}}', '{\"squats_probability\": \"unknown\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of ankle mobility in squatting and suggests that it can be improved with training.\", \"wrong\": \"The segment likely demonstrates difficulty in squatting due to tight ankles.\", \"correction\": \"To improve squat depth and form, consider placing your heels on a yoga block or a book to compensate for limited ankle mobility.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"unknown\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of stability and function in the squat position, highlighting that squatting is a natural human ability.\", \"wrong\": null, \"correction\": \"Practice sitting in a squat position daily for 10 minutes to regain natural squatting ability.\"}}', '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"correct\", \"squats_feedback\": {\"right\": \"The segment provides a detailed, step-by-step guide on how to perform a squat correctly, including specific instructions on foot placement, arm positioning, weight distribution, and movement.\", \"wrong\": null, \"correction\": null}}']}}\n" - ] - } - ], - "source": [ - "thread = {\"configurable\": {\"thread_id\": \"1\"}}\n", - "for s in graph.stream(\n", - " {\n", - " \"task\": \"i wanna teach people how to do squats\",\n", - " \"clip_text_prompts\": [\"person doing squats\"],\n", - " },\n", - " thread,\n", - "):\n", - " if \"download\" in s:\n", - " print(\"dowload happened\")\n", - " elif \"extract_clues\" in s:\n", - " print(\"extract_clues happened\")\n", - " else:\n", - " print(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'task': 'i wanna teach people how to do squats',\n", - " 'search_queries': ['how to do squats', 'squat exercise 
tutorial'],\n", - " 'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E'],\n", - " 'video_infos': [VideoInfo(video_id='xqvCmoLULNY', url='https://www.youtube.com/watch?v=xqvCmoLULNY', relative_video_path='videos/xqvCmoLULNY.mp4', subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:00.160 --> 00:00:01.829 align:start position:0%\\n \\nlet's<00:00:00.399> learn<00:00:00.560> how<00:00:00.719> to<00:00:00.880> properly<00:00:01.280> perform<00:00:01.760> a\\n\\n00:00:01.829 --> 00:00:01.839 align:start position:0%\\nlet's learn how to properly perform a\\n \\n\\n00:00:01.839 --> 00:00:02.790 align:start position:0%\\nlet's learn how to properly perform a\\nsquat\\n\\n00:00:02.790 --> 00:00:02.800 align:start position:0%\\nsquat\\n \\n\\n00:00:02.800 --> 00:00:04.470 align:start position:0%\\nsquat\\nstart<00:00:03.120> with<00:00:03.199> your<00:00:03.360> feet<00:00:03.679> slightly<00:00:04.080> wider<00:00:04.319> than\\n\\n00:00:04.470 --> 00:00:04.480 align:start position:0%\\nstart with your feet slightly wider than\\n \\n\\n00:00:04.480 --> 00:00:06.389 align:start position:0%\\nstart with your feet slightly wider than\\nshoulder<00:00:04.799> width<00:00:05.120> apart\\n\\n00:00:06.389 --> 00:00:06.399 align:start position:0%\\nshoulder width apart\\n \\n\\n00:00:06.399 --> 00:00:09.190 align:start position:0%\\nshoulder width apart\\ncross<00:00:06.799> your<00:00:07.040> arms<00:00:07.440> in<00:00:07.759> front\\n\\n00:00:09.190 --> 00:00:09.200 align:start position:0%\\ncross your arms in front\\n \\n\\n00:00:09.200 --> 00:00:11.270 align:start position:0%\\ncross your arms in front\\nso<00:00:09.440> touch<00:00:09.679> your<00:00:09.920> right<00:00:10.240> hand<00:00:10.559> to<00:00:10.719> your<00:00:10.960> left\\n\\n00:00:11.270 --> 00:00:11.280 align:start position:0%\\nso touch your right hand to your left\\n \\n\\n00:00:11.280 --> 00:00:13.350 align:start position:0%\\nso touch your right hand to your left\\nshoulder<00:00:12.080> and<00:00:12.320> vice<00:00:12.559> versa<00:00:13.040> pointing<00:00:13.280> your\\n\\n00:00:13.350 --> 00:00:13.360 align:start position:0%\\nshoulder and vice versa pointing your\\n \\n\\n00:00:13.360 --> 00:00:15.190 align:start position:0%\\nshoulder and vice versa pointing your\\nelbows<00:00:13.679> straight<00:00:13.920> ahead<00:00:14.559> now<00:00:14.719> from<00:00:14.960> here\\n\\n00:00:15.190 --> 00:00:15.200 align:start position:0%\\nelbows straight ahead now from here\\n \\n\\n00:00:15.200 --> 00:00:17.109 align:start position:0%\\nelbows straight ahead now from here\\nshift<00:00:15.440> your<00:00:15.679> weight<00:00:16.160> to<00:00:16.320> the<00:00:16.480> ball<00:00:16.720> of<00:00:16.880> your\\n\\n00:00:17.109 --> 00:00:17.119 align:start position:0%\\nshift your weight to the ball of your\\n \\n\\n00:00:17.119 --> 00:00:18.230 align:start position:0%\\nshift your weight to the ball of your\\nfeet\\n\\n00:00:18.230 --> 00:00:18.240 align:start position:0%\\nfeet\\n \\n\\n00:00:18.240 --> 00:00:21.109 align:start position:0%\\nfeet\\nand<00:00:18.400> bend<00:00:18.720> your<00:00:18.880> knees\\n\\n00:00:21.109 --> 00:00:21.119 align:start position:0%\\nand bend your knees\\n \\n\\n00:00:21.119 --> 00:00:23.189 align:start position:0%\\nand bend your knees\\nget<00:00:21.359> as<00:00:21.520> close<00:00:21.680> to<00:00:21.840> 90<00:00:22.160> degrees<00:00:22.480> as<00:00:22.640> you<00:00:22.800> can\\n\\n00:00:23.189 --> 00:00:23.199 align:start position:0%\\nget as close to 90 degrees as you can\\n 
\\n\\n00:00:23.199 --> 00:00:25.830 align:start position:0%\\nget as close to 90 degrees as you can\\nlooking<00:00:23.519> straight<00:00:23.840> ahead<00:00:24.480> and<00:00:24.720> from<00:00:24.960> here\\n\\n00:00:25.830 --> 00:00:25.840 align:start position:0%\\nlooking straight ahead and from here\\n \\n\\n00:00:25.840 --> 00:00:29.109 align:start position:0%\\nlooking straight ahead and from here\\npush<00:00:26.160> back<00:00:26.400> up<00:00:26.480> to<00:00:26.640> the<00:00:26.720> starting<00:00:27.039> position\\n\\n00:00:29.109 --> 00:00:29.119 align:start position:0%\\npush back up to the starting position\\n \\n\\n00:00:29.119 --> 00:00:30.150 align:start position:0%\\npush back up to the starting position\\nthis<00:00:29.359> is<00:00:29.439> going<00:00:29.599> to<00:00:29.679> be<00:00:29.760> great<00:00:30.000> for\\n\\n00:00:30.150 --> 00:00:30.160 align:start position:0%\\nthis is going to be great for\\n \\n\\n00:00:30.160 --> 00:00:31.830 align:start position:0%\\nthis is going to be great for\\nstrengthening\\n\\n00:00:31.830 --> 00:00:31.840 align:start position:0%\\nstrengthening\\n \\n\\n00:00:31.840 --> 00:00:34.069 align:start position:0%\\nstrengthening\\nyour<00:00:32.079> thighs<00:00:32.480> or<00:00:32.559> your<00:00:32.719> quadriceps<00:00:33.680> as<00:00:33.840> well\\n\\n00:00:34.069 --> 00:00:34.079 align:start position:0%\\nyour thighs or your quadriceps as well\\n \\n\\n00:00:34.079 --> 00:00:36.150 align:start position:0%\\nyour thighs or your quadriceps as well\\nas<00:00:34.239> your<00:00:34.480> butt<00:00:34.719> or<00:00:34.800> your<00:00:34.960> glutes\\n\\n00:00:36.150 --> 00:00:36.160 align:start position:0%\\nas your butt or your glutes\\n \\n\\n00:00:36.160 --> 00:00:37.750 align:start position:0%\\nas your butt or your glutes\\nand<00:00:36.399> it's<00:00:36.559> also<00:00:36.800> going<00:00:36.800> to<00:00:36.880> be<00:00:37.040> great<00:00:37.280> to<00:00:37.440> work\\n\\n00:00:37.750 --> 00:00:37.760 align:start position:0%\\nand it's also going to be great to work\\n \\n\\n00:00:37.760 --> 00:00:41.510 align:start position:0%\\nand it's also going to be great to work\\non<00:00:38.079> your<00:00:38.320> posture\\n\\n00:00:41.510 --> 00:00:41.520 align:start position:0%\\n \\n \\n\\n00:00:41.520 --> 00:00:45.840 align:start position:0%\\n \\nthat's<00:00:41.760> how<00:00:41.920> to<00:00:42.000> properly<00:00:42.399> perform<00:00:42.879> a<00:00:42.960> squat\\n\\n\", transcript=\"00:00:00\\nlet's learn how to properly perform a\\n00:00:01\\nsquat\\n00:00:02\\nstart with your feet slightly wider than\\n00:00:04\\nshoulder width apart\\n00:00:06\\ncross your arms in front\\n00:00:09\\nso touch your right hand to your left\\n00:00:11\\nshoulder and vice versa pointing your\\n00:00:13\\nelbows straight ahead now from here\\n00:00:15\\nshift your weight to the ball of your\\n00:00:17\\nfeet\\n00:00:18\\nand bend your knees\\n00:00:21\\nget as close to 90 degrees as you can\\n00:00:23\\nlooking straight ahead and from here\\n00:00:25\\npush back up to the starting position\\n00:00:29\\nthis is going to be great for\\n00:00:30\\nstrengthening\\n00:00:31\\nyour thighs or your quadriceps as well\\n00:00:34\\nas your butt or your glutes\\n00:00:36\\nand it's also going to be great to work\\n00:00:37\\non your posture\\n00:00:41\\nthat's how to properly perform a squat\"),\n", - " VideoInfo(video_id='IB_icWRzi4E', url='https://www.youtube.com/watch?v=IB_icWRzi4E', relative_video_path='videos/IB_icWRzi4E.mp4', 
subs=\"WEBVTT\\nKind: captions\\nLanguage: en\\n\\n00:00:02.419 --> 00:00:05.150 align:start position:0%\\n \\nhi<00:00:03.419> I'm<00:00:03.810> Roger<00:00:03.990> Frampton<00:00:04.319> and<00:00:04.770> I'm<00:00:04.859> a<00:00:04.950> movement\\n\\n00:00:05.150 --> 00:00:05.160 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\n \\n\\n00:00:05.160 --> 00:00:07.400 align:start position:0%\\nhi I'm Roger Frampton and I'm a movement\\ncoach<00:00:05.520> from<00:00:05.910> London<00:00:06.299> and<00:00:06.509> I'm<00:00:06.779> author<00:00:07.049> of<00:00:07.230> the\\n\\n00:00:07.400 --> 00:00:07.410 align:start position:0%\\ncoach from London and I'm author of the\\n \\n\\n00:00:07.410 --> 00:00:10.310 align:start position:0%\\ncoach from London and I'm author of the\\nbook<00:00:07.649> the<00:00:08.010> flexible<00:00:08.400> body<00:00:08.660> so<00:00:09.660> this<00:00:09.840> position\\n\\n00:00:10.310 --> 00:00:10.320 align:start position:0%\\nbook the flexible body so this position\\n \\n\\n00:00:10.320 --> 00:00:11.780 align:start position:0%\\nbook the flexible body so this position\\nis<00:00:10.860> the<00:00:11.099> squat\\n\\n00:00:11.780 --> 00:00:11.790 align:start position:0%\\nis the squat\\n \\n\\n00:00:11.790 --> 00:00:13.459 align:start position:0%\\nis the squat\\nmost<00:00:12.150> people<00:00:12.480> when<00:00:12.599> I<00:00:12.660> talk<00:00:12.870> about<00:00:12.990> the<00:00:13.200> squat\\n\\n00:00:13.459 --> 00:00:13.469 align:start position:0%\\nmost people when I talk about the squat\\n \\n\\n00:00:13.469 --> 00:00:14.780 align:start position:0%\\nmost people when I talk about the squat\\nthinking<00:00:13.799> when<00:00:13.920> I'm<00:00:14.009> forming<00:00:14.340> up<00:00:14.460> and<00:00:14.639> down\\n\\n00:00:14.780 --> 00:00:14.790 align:start position:0%\\nthinking when I'm forming up and down\\n \\n\\n00:00:14.790 --> 00:00:16.790 align:start position:0%\\nthinking when I'm forming up and down\\nexercise<00:00:15.299> it<00:00:15.509> works<00:00:15.750> your<00:00:15.960> legs<00:00:16.080> and<00:00:16.350> bar\\n\\n00:00:16.790 --> 00:00:16.800 align:start position:0%\\nexercise it works your legs and bar\\n \\n\\n00:00:16.800 --> 00:00:19.670 align:start position:0%\\nexercise it works your legs and bar\\nwell<00:00:17.640> the<00:00:17.789> squat<00:00:18.090> is<00:00:18.210> actually<00:00:18.570> a<00:00:18.680> position\\n\\n00:00:19.670 --> 00:00:19.680 align:start position:0%\\nwell the squat is actually a position\\n \\n\\n00:00:19.680 --> 00:00:22.609 align:start position:0%\\nwell the squat is actually a position\\nthat<00:00:20.520> we're<00:00:20.730> designed<00:00:21.029> to<00:00:21.180> defecate<00:00:21.660> in<00:00:21.689> every\\n\\n00:00:22.609 --> 00:00:22.619 align:start position:0%\\nthat we're designed to defecate in every\\n \\n\\n00:00:22.619 --> 00:00:25.810 align:start position:0%\\nthat we're designed to defecate in every\\nkid<00:00:22.830> sits<00:00:23.670> and<00:00:24.000> rests<00:00:24.600> in<00:00:24.900> this<00:00:25.140> position<00:00:25.560> and\\n\\n00:00:25.810 --> 00:00:25.820 align:start position:0%\\nkid sits and rests in this position and\\n \\n\\n00:00:25.820 --> 00:00:28.700 align:start position:0%\\nkid sits and rests in this position and\\nif<00:00:26.820> we<00:00:26.910> look<00:00:27.119> at<00:00:27.240> Western<00:00:27.480> people<00:00:28.050> I<00:00:28.230> think\\n\\n00:00:28.700 --> 00:00:28.710 align:start position:0%\\nif we look at Western 
people I think\\n \\n\\n00:00:28.710 --> 00:00:31.550 align:start position:0%\\nif we look at Western people I think\\nmost<00:00:28.890> people<00:00:29.400> end<00:00:30.029> up<00:00:30.240> sitting<00:00:30.510> up<00:00:31.199> on<00:00:31.410> their\\n\\n00:00:31.550 --> 00:00:31.560 align:start position:0%\\nmost people end up sitting up on their\\n \\n\\n00:00:31.560 --> 00:00:33.740 align:start position:0%\\nmost people end up sitting up on their\\ntoes<00:00:31.740> more<00:00:32.070> in<00:00:32.219> this<00:00:32.309> position<00:00:32.759> while<00:00:33.660> I'm\\n\\n00:00:33.740 --> 00:00:33.750 align:start position:0%\\ntoes more in this position while I'm\\n \\n\\n00:00:33.750 --> 00:00:35.510 align:start position:0%\\ntoes more in this position while I'm\\nadvising<00:00:34.290> is<00:00:34.410> that<00:00:34.530> you<00:00:34.649> get<00:00:34.890> this<00:00:35.070> position\\n\\n00:00:35.510 --> 00:00:35.520 align:start position:0%\\nadvising is that you get this position\\n \\n\\n00:00:35.520 --> 00:00:38.150 align:start position:0%\\nadvising is that you get this position\\nback<00:00:35.730> not<00:00:36.510> for<00:00:36.719> an<00:00:36.809> exercise<00:00:37.260> necessarily<00:00:38.010> not\\n\\n00:00:38.150 --> 00:00:38.160 align:start position:0%\\nback not for an exercise necessarily not\\n \\n\\n00:00:38.160 --> 00:00:40.610 align:start position:0%\\nback not for an exercise necessarily not\\nbecause<00:00:38.489> of<00:00:38.520> fitness<00:00:38.969> or<00:00:39.149> to<00:00:39.270> be<00:00:39.420> fitter<00:00:39.719> but\\n\\n00:00:40.610 --> 00:00:40.620 align:start position:0%\\nbecause of fitness or to be fitter but\\n \\n\\n00:00:40.620 --> 00:00:43.430 align:start position:0%\\nbecause of fitness or to be fitter but\\njust<00:00:40.920> because<00:00:41.520> you're<00:00:41.730> designed<00:00:42.120> to<00:00:43.110> do<00:00:43.260> it\\n\\n00:00:43.430 --> 00:00:43.440 align:start position:0%\\njust because you're designed to do it\\n \\n\\n00:00:43.440 --> 00:00:45.350 align:start position:0%\\njust because you're designed to do it\\nthis<00:00:43.710> here<00:00:43.980> is<00:00:44.219> just<00:00:44.399> the<00:00:44.610> human<00:00:44.789> resting\\n\\n00:00:45.350 --> 00:00:45.360 align:start position:0%\\nthis here is just the human resting\\n \\n\\n00:00:45.360 --> 00:00:48.229 align:start position:0%\\nthis here is just the human resting\\nposition<00:00:45.780> so<00:00:46.230> when<00:00:46.379> I<00:00:46.410> run<00:00:46.649> classes<00:00:47.250> people\\n\\n00:00:48.229 --> 00:00:48.239 align:start position:0%\\nposition so when I run classes people\\n \\n\\n00:00:48.239 --> 00:00:50.119 align:start position:0%\\nposition so when I run classes people\\nalways<00:00:48.480> talk<00:00:49.140> about<00:00:49.170> the<00:00:49.469> squat<00:00:49.770> how<00:00:49.980> they\\n\\n00:00:50.119 --> 00:00:50.129 align:start position:0%\\nalways talk about the squat how they\\n \\n\\n00:00:50.129 --> 00:00:51.590 align:start position:0%\\nalways talk about the squat how they\\ncan't<00:00:50.399> quite<00:00:50.610> get<00:00:50.789> their<00:00:50.969> heel<00:00:51.149> down<00:00:51.390> on<00:00:51.510> the\\n\\n00:00:51.590 --> 00:00:51.600 align:start position:0%\\ncan't quite get their heel down on the\\n \\n\\n00:00:51.600 --> 00:00:53.569 align:start position:0%\\ncan't quite get their heel down on the\\nfloor<00:00:51.840> now<00:00:52.649> the<00:00:52.739> reason<00:00:53.070> for<00:00:53.250> that<00:00:53.309> 
is<00:00:53.550> because\\n\\n00:00:53.569 --> 00:00:53.579 align:start position:0%\\nfloor now the reason for that is because\\n \\n\\n00:00:53.579 --> 00:00:55.220 align:start position:0%\\nfloor now the reason for that is because\\nsince<00:00:54.059> about<00:00:54.239> the<00:00:54.300> age<00:00:54.420> of<00:00:54.600> four<00:00:54.840> years<00:00:55.050> old\\n\\n00:00:55.220 --> 00:00:55.230 align:start position:0%\\nsince about the age of four years old\\n \\n\\n00:00:55.230 --> 00:00:58.099 align:start position:0%\\nsince about the age of four years old\\nyou've<00:00:55.800> been<00:00:56.010> wearing<00:00:56.219> shoes<00:00:56.809> we<00:00:57.809> are<00:00:57.960> an\\n\\n00:00:58.099 --> 00:00:58.109 align:start position:0%\\nyou've been wearing shoes we are an\\n \\n\\n00:00:58.109 --> 00:00:58.549 align:start position:0%\\nyou've been wearing shoes we are an\\nanimal\\n\\n00:00:58.549 --> 00:00:58.559 align:start position:0%\\nanimal\\n \\n\\n00:00:58.559 --> 00:01:00.229 align:start position:0%\\nanimal\\nwe're<00:00:59.070> designed<00:00:59.370> to<00:00:59.460> walk<00:00:59.640> around<00:00:59.789> on<00:01:00.059> bare\\n\\n00:01:00.229 --> 00:01:00.239 align:start position:0%\\nwe're designed to walk around on bare\\n \\n\\n00:01:00.239 --> 00:01:02.510 align:start position:0%\\nwe're designed to walk around on bare\\nfeet<00:01:00.270> and<00:01:00.690> the<00:01:01.109> reason<00:01:01.469> that<00:01:01.559> we'll<00:01:01.710> lose<00:01:01.949> our\\n\\n00:01:02.510 --> 00:01:02.520 align:start position:0%\\nfeet and the reason that we'll lose our\\n \\n\\n00:01:02.520 --> 00:01:05.210 align:start position:0%\\nfeet and the reason that we'll lose our\\nsquat<00:01:03.090> mobility<00:01:03.570> is<00:01:04.229> simply<00:01:04.619> because<00:01:04.920> we<00:01:05.040> come\\n\\n00:01:05.210 --> 00:01:05.220 align:start position:0%\\nsquat mobility is simply because we come\\n \\n\\n00:01:05.220 --> 00:01:07.370 align:start position:0%\\nsquat mobility is simply because we come\\nto<00:01:05.369> become<00:01:05.610> tight<00:01:05.970> in<00:01:06.150> our<00:01:06.210> ankles<00:01:06.659> and\\n\\n00:01:07.370 --> 00:01:07.380 align:start position:0%\\nto become tight in our ankles and\\n \\n\\n00:01:07.380 --> 00:01:09.590 align:start position:0%\\nto become tight in our ankles and\\ntherefore<00:01:08.070> you<00:01:08.670> can't<00:01:08.970> get<00:01:09.090> all<00:01:09.210> the<00:01:09.270> way<00:01:09.360> down\\n\\n00:01:09.590 --> 00:01:09.600 align:start position:0%\\ntherefore you can't get all the way down\\n \\n\\n00:01:09.600 --> 00:01:11.990 align:start position:0%\\ntherefore you can't get all the way down\\nto<00:01:09.780> the<00:01:09.869> bottom<00:01:10.010> now<00:01:11.010> all<00:01:11.369> you<00:01:11.490> need<00:01:11.640> to<00:01:11.729> do<00:01:11.880> is\\n\\n00:01:11.990 --> 00:01:12.000 align:start position:0%\\nto the bottom now all you need to do is\\n \\n\\n00:01:12.000 --> 00:01:14.030 align:start position:0%\\nto the bottom now all you need to do is\\nif<00:01:12.240> you<00:01:12.390> put<00:01:12.570> your<00:01:12.600> heels<00:01:12.990> on<00:01:13.200> a<00:01:13.229> yoga<00:01:13.590> block<00:01:13.799> or\\n\\n00:01:14.030 --> 00:01:14.040 align:start position:0%\\nif you put your heels on a yoga block or\\n \\n\\n00:01:14.040 --> 00:01:16.580 align:start position:0%\\nif you put your heels on a yoga block or\\na<00:01:14.100> book<00:01:14.400> you're<00:01:15.000> in<00:01:15.119> 
that<00:01:15.270> position<00:01:15.590> rather\\n\\n00:01:16.580 --> 00:01:16.590 align:start position:0%\\na book you're in that position rather\\n \\n\\n00:01:16.590 --> 00:01:19.070 align:start position:0%\\na book you're in that position rather\\nthan<00:01:16.799> that<00:01:16.890> position<00:01:17.430> that<00:01:18.330> will<00:01:18.600> take<00:01:18.900> away\\n\\n00:01:19.070 --> 00:01:19.080 align:start position:0%\\nthan that position that will take away\\n \\n\\n00:01:19.080 --> 00:01:21.050 align:start position:0%\\nthan that position that will take away\\nyour<00:01:19.320> ankle<00:01:19.680> mobility<00:01:19.770> and<00:01:20.280> allow<00:01:20.549> you<00:01:20.610> to<00:01:20.909> get\\n\\n00:01:21.050 --> 00:01:21.060 align:start position:0%\\nyour ankle mobility and allow you to get\\n \\n\\n00:01:21.060 --> 00:01:23.149 align:start position:0%\\nyour ankle mobility and allow you to get\\nall<00:01:21.210> the<00:01:21.330> way<00:01:21.450> down<00:01:21.689> now<00:01:22.500> you<00:01:22.560> can<00:01:22.770> use<00:01:22.920> a<00:01:22.950> yoga\\n\\n00:01:23.149 --> 00:01:23.159 align:start position:0%\\nall the way down now you can use a yoga\\n \\n\\n00:01:23.159 --> 00:01:24.950 align:start position:0%\\nall the way down now you can use a yoga\\nblock<00:01:23.490> or<00:01:23.640> a<00:01:23.670> book<00:01:23.880> or<00:01:24.119> two<00:01:24.299> or<00:01:24.450> three<00:01:24.479> of<00:01:24.720> them\\n\\n00:01:24.950 --> 00:01:24.960 align:start position:0%\\nblock or a book or two or three of them\\n \\n\\n00:01:24.960 --> 00:01:27.590 align:start position:0%\\nblock or a book or two or three of them\\nif<00:01:25.830> you're<00:01:26.009> a<00:01:26.070> girl<00:01:26.400> squatting<00:01:26.939> in<00:01:27.060> heels<00:01:27.299> it's\\n\\n00:01:27.590 --> 00:01:27.600 align:start position:0%\\nif you're a girl squatting in heels it's\\n \\n\\n00:01:27.600 --> 00:01:29.600 align:start position:0%\\nif you're a girl squatting in heels it's\\nso<00:01:27.869> much<00:01:28.049> easier<00:01:28.080> and<00:01:28.650> squatting<00:01:29.189> flat<00:01:29.460> on<00:01:29.549> the\\n\\n00:01:29.600 --> 00:01:29.610 align:start position:0%\\nso much easier and squatting flat on the\\n \\n\\n00:01:29.610 --> 00:01:31.490 align:start position:0%\\nso much easier and squatting flat on the\\nfloor<00:01:29.820> so<00:01:30.479> what<00:01:30.600> you<00:01:30.720> do<00:01:30.840> is<00:01:30.960> you<00:01:31.079> start<00:01:31.320> on\\n\\n00:01:31.490 --> 00:01:31.500 align:start position:0%\\nfloor so what you do is you start on\\n \\n\\n00:01:31.500 --> 00:01:33.319 align:start position:0%\\nfloor so what you do is you start on\\nwhatever<00:01:31.710> angle<00:01:32.220> you're<00:01:32.400> comfortable<00:01:32.970> in<00:01:33.090> and\\n\\n00:01:33.319 --> 00:01:33.329 align:start position:0%\\nwhatever angle you're comfortable in and\\n \\n\\n00:01:33.329 --> 00:01:36.260 align:start position:0%\\nwhatever angle you're comfortable in and\\nyou<00:01:34.110> work<00:01:34.350> your<00:01:34.560> way<00:01:34.710> down<00:01:34.740> to<00:01:35.549> becoming<00:01:36.000> flat\\n\\n00:01:36.260 --> 00:01:36.270 align:start position:0%\\nyou work your way down to becoming flat\\n \\n\\n00:01:36.270 --> 00:01:39.679 align:start position:0%\\nyou work your way down to becoming flat\\non<00:01:36.360> the<00:01:36.450> floor<00:01:36.659> so<00:01:37.409> really<00:01:37.740> a<00:01:37.770> squat<00:01:38.490> is<00:01:38.939> 
just<00:01:39.509> a\\n\\n00:01:39.679 --> 00:01:39.689 align:start position:0%\\non the floor so really a squat is just a\\n \\n\\n00:01:39.689 --> 00:01:42.469 align:start position:0%\\non the floor so really a squat is just a\\ntest<00:01:40.079> of<00:01:40.350> your<00:01:41.070> ankles<00:01:41.549> if<00:01:41.880> you've<00:01:42.090> got<00:01:42.299> good\\n\\n00:01:42.469 --> 00:01:42.479 align:start position:0%\\ntest of your ankles if you've got good\\n \\n\\n00:01:42.479 --> 00:01:45.499 align:start position:0%\\ntest of your ankles if you've got good\\nankles<00:01:42.869> you're<00:01:43.590> great<00:01:43.890> at<00:01:44.070> squatting<00:01:44.549> and<00:01:44.759> if\\n\\n00:01:45.499 --> 00:01:45.509 align:start position:0%\\nankles you're great at squatting and if\\n \\n\\n00:01:45.509 --> 00:01:47.420 align:start position:0%\\nankles you're great at squatting and if\\nyour<00:01:45.659> ankles<00:01:46.020> have<00:01:46.110> become<00:01:46.409> tight<00:01:46.680> then<00:01:47.009> squat\\n\\n00:01:47.420 --> 00:01:47.430 align:start position:0%\\nyour ankles have become tight then squat\\n \\n\\n00:01:47.430 --> 00:01:49.609 align:start position:0%\\nyour ankles have become tight then squat\\nis<00:01:47.549> really<00:01:47.820> difficult<00:01:48.060> and<00:01:48.450> hard<00:01:48.689> but<00:01:49.350> like\\n\\n00:01:49.609 --> 00:01:49.619 align:start position:0%\\nis really difficult and hard but like\\n \\n\\n00:01:49.619 --> 00:01:51.499 align:start position:0%\\nis really difficult and hard but like\\nevery<00:01:50.070> other<00:01:50.220> muscle<00:01:50.490> and<00:01:50.880> joint<00:01:51.180> in<00:01:51.390> the<00:01:51.479> body\\n\\n00:01:51.499 --> 00:01:51.509 align:start position:0%\\nevery other muscle and joint in the body\\n \\n\\n00:01:51.509 --> 00:01:54.080 align:start position:0%\\nevery other muscle and joint in the body\\nis<00:01:51.960> trainable<00:01:52.680> and<00:01:52.920> you've<00:01:53.399> got<00:01:53.549> time<00:01:53.820> to<00:01:53.970> get\\n\\n00:01:54.080 --> 00:01:54.090 align:start position:0%\\nis trainable and you've got time to get\\n \\n\\n00:01:54.090 --> 00:01:56.870 align:start position:0%\\nis trainable and you've got time to get\\nit<00:01:54.210> back<00:01:54.710> if<00:01:55.710> you<00:01:55.770> do<00:01:56.070> something<00:01:56.369> for<00:01:56.549> 10\\n\\n00:01:56.870 --> 00:01:56.880 align:start position:0%\\nit back if you do something for 10\\n \\n\\n00:01:56.880 --> 00:01:58.850 align:start position:0%\\nit back if you do something for 10\\nminutes<00:01:57.149> every<00:01:57.360> day<00:01:57.540> kind<00:01:58.259> of<00:01:58.350> like<00:01:58.560> brushing\\n\\n00:01:58.850 --> 00:01:58.860 align:start position:0%\\nminutes every day kind of like brushing\\n \\n\\n00:01:58.860 --> 00:02:01.160 align:start position:0%\\nminutes every day kind of like brushing\\nyour<00:01:59.070> teeth<00:01:59.280> it<00:02:00.000> gets<00:02:00.240> into<00:02:00.509> a<00:02:00.540> habit<00:02:00.930> and<00:02:01.110> you\\n\\n00:02:01.160 --> 00:02:01.170 align:start position:0%\\nyour teeth it gets into a habit and you\\n \\n\\n00:02:01.170 --> 00:02:02.719 align:start position:0%\\nyour teeth it gets into a habit and you\\ngo<00:02:01.320> you<00:02:01.380> get<00:02:01.619> up<00:02:01.740> you<00:02:01.890> brush<00:02:02.070> your<00:02:02.219> teeth<00:02:02.369> go<00:02:02.670> to\\n\\n00:02:02.719 --> 00:02:02.729 align:start position:0%\\ngo you get up you brush your teeth go 
to\\n \\n\\n00:02:02.729 --> 00:02:05.270 align:start position:0%\\ngo you get up you brush your teeth go to\\nveggie<00:02:03.000> brush<00:02:03.240> your<00:02:03.390> teeth<00:02:03.540> so<00:02:04.290> by<00:02:04.740> doing<00:02:04.799> 10\\n\\n00:02:05.270 --> 00:02:05.280 align:start position:0%\\nveggie brush your teeth so by doing 10\\n \\n\\n00:02:05.280 --> 00:02:06.080 align:start position:0%\\nveggie brush your teeth so by doing 10\\nminutes<00:02:05.520> every<00:02:05.820> day\\n\\n00:02:06.080 --> 00:02:06.090 align:start position:0%\\nminutes every day\\n \\n\\n00:02:06.090 --> 00:02:08.690 align:start position:0%\\nminutes every day\\nyou'll<00:02:06.810> get<00:02:06.990> into<00:02:07.109> the<00:02:07.259> habit<00:02:07.469> by<00:02:08.399> playing\\n\\n00:02:08.690 --> 00:02:08.700 align:start position:0%\\nyou'll get into the habit by playing\\n \\n\\n00:02:08.700 --> 00:02:10.759 align:start position:0%\\nyou'll get into the habit by playing\\nusing<00:02:09.420> just<00:02:09.750> that<00:02:09.899> short<00:02:10.140> of<00:02:10.289> a<00:02:10.379> short<00:02:10.590> amount\\n\\n00:02:10.759 --> 00:02:10.769 align:start position:0%\\nusing just that short of a short amount\\n \\n\\n00:02:10.769 --> 00:02:12.559 align:start position:0%\\nusing just that short of a short amount\\nof<00:02:10.920> time<00:02:11.129> you<00:02:11.310> can<00:02:11.340> really<00:02:11.700> focus<00:02:11.910> on<00:02:12.239> how<00:02:12.390> your\\n\\n00:02:12.559 --> 00:02:12.569 align:start position:0%\\nof time you can really focus on how your\\n \\n\\n00:02:12.569 --> 00:02:13.520 align:start position:0%\\nof time you can really focus on how your\\nbody's<00:02:12.870> anemic\\n\\n00:02:13.520 --> 00:02:13.530 align:start position:0%\\nbody's anemic\\n \\n\\n00:02:13.530 --> 00:02:15.050 align:start position:0%\\nbody's anemic\\nnow<00:02:14.010> some<00:02:14.220> of<00:02:14.280> the<00:02:14.340> exercises<00:02:14.790> that<00:02:14.940> I\\n\\n00:02:15.050 --> 00:02:15.060 align:start position:0%\\nnow some of the exercises that I\\n \\n\\n00:02:15.060 --> 00:02:16.699 align:start position:0%\\nnow some of the exercises that I\\nrecommend<00:02:15.540> that<00:02:15.599> you<00:02:15.780> do<00:02:15.989> within<00:02:16.290> those<00:02:16.440> 10\\n\\n00:02:16.699 --> 00:02:16.709 align:start position:0%\\nrecommend that you do within those 10\\n \\n\\n00:02:16.709 --> 00:02:20.270 align:start position:0%\\nrecommend that you do within those 10\\nminutes<00:02:16.860> first<00:02:17.849> one<00:02:18.120> is<00:02:18.330> the<00:02:18.720> squat<00:02:19.280> sitting\\n\\n00:02:20.270 --> 00:02:20.280 align:start position:0%\\nminutes first one is the squat sitting\\n \\n\\n00:02:20.280 --> 00:02:22.759 align:start position:0%\\nminutes first one is the squat sitting\\nin<00:02:20.489> a<00:02:20.580> squat<00:02:20.940> not<00:02:21.480> bobbing<00:02:22.019> up<00:02:22.080> and<00:02:22.230> down<00:02:22.349> just\\n\\n00:02:22.759 --> 00:02:22.769 align:start position:0%\\nin a squat not bobbing up and down just\\n \\n\\n00:02:22.769 --> 00:02:24.589 align:start position:0%\\nin a squat not bobbing up and down just\\nbeing<00:02:23.010> able<00:02:23.190> to<00:02:23.370> function<00:02:24.000> to<00:02:24.209> sit<00:02:24.420> in<00:02:24.540> the\\n\\n00:02:24.589 --> 00:02:24.599 align:start position:0%\\nbeing able to function to sit in the\\n \\n\\n00:02:24.599 --> 00:02:27.229 align:start position:0%\\nbeing able to function to sit in 
the\\nsquat<00:02:24.930> something<00:02:25.709> again<00:02:26.010> is<00:02:26.250> innate<00:02:26.400> to<00:02:26.819> us<00:02:26.849> as\\n\\n00:02:27.229 --> 00:02:27.239 align:start position:0%\\nsquat something again is innate to us as\\n \\n\\n00:02:27.239 --> 00:02:28.729 align:start position:0%\\nsquat something again is innate to us as\\nhuman<00:02:27.569> beings<00:02:27.599> and<00:02:28.049> something<00:02:28.440> that<00:02:28.530> you<00:02:28.650> are\\n\\n00:02:28.729 --> 00:02:28.739 align:start position:0%\\nhuman beings and something that you are\\n \\n\\n00:02:28.739 --> 00:02:31.670 align:start position:0%\\nhuman beings and something that you are\\nable<00:02:28.890> to<00:02:29.220> do<00:02:29.910> as<00:02:30.090> a<00:02:30.120> child<00:02:30.209> so<00:02:30.870> spend<00:02:31.440> 10\\n\\n00:02:31.670 --> 00:02:31.680 align:start position:0%\\nable to do as a child so spend 10\\n \\n\\n00:02:31.680 --> 00:02:34.640 align:start position:0%\\nable to do as a child so spend 10\\nminutes<00:02:32.239> every<00:02:33.239> day<00:02:33.450> getting<00:02:34.110> your<00:02:34.410> squat\\n\\n00:02:34.640 --> 00:02:34.650 align:start position:0%\\nminutes every day getting your squat\\n \\n\\n00:02:34.650 --> 00:02:37.360 align:start position:0%\\nminutes every day getting your squat\\nback\\n\\n00:02:37.360 --> 00:02:37.370 align:start position:0%\\n \\n \\n\\n00:02:37.370 --> 00:02:51.610 align:start position:0%\\n \\n[Music]\\n\\n00:02:51.610 --> 00:02:51.620 align:start position:0%\\n \\n \\n\\n00:02:51.620 --> 00:02:56.310 align:start position:0%\\n \\n[Music]\\n\\n\", transcript=\"00:00:02\\nhi I'm Roger Frampton and I'm a movement\\n00:00:05\\ncoach from London and I'm author of the\\n00:00:07\\nbook the flexible body so this position\\n00:00:10\\nis the squat\\n00:00:11\\nmost people when I talk about the squat\\n00:00:13\\nthinking when I'm forming up and down\\n00:00:14\\nexercise it works your legs and bar\\n00:00:16\\nwell the squat is actually a position\\n00:00:19\\nthat we're designed to defecate in every\\n00:00:22\\nkid sits and rests in this position and\\n00:00:25\\nif we look at Western people I think\\n00:00:28\\nmost people end up sitting up on their\\n00:00:31\\ntoes more in this position while I'm\\n00:00:33\\nadvising is that you get this position\\n00:00:35\\nback not for an exercise necessarily not\\n00:00:38\\nbecause of fitness or to be fitter but\\n00:00:40\\njust because you're designed to do it\\n00:00:43\\nthis here is just the human resting\\n00:00:45\\nposition so when I run classes people\\n00:00:48\\nalways talk about the squat how they\\n00:00:50\\ncan't quite get their heel down on the\\n00:00:51\\nfloor now the reason for that is because\\n00:00:53\\nsince about the age of four years old\\n00:00:55\\nyou've been wearing shoes we are an\\n00:00:58\\nanimal\\nwe're designed to walk around on bare\\n00:01:00\\nfeet and the reason that we'll lose our\\n00:01:02\\nsquat mobility is simply because we come\\n00:01:05\\nto become tight in our ankles and\\n00:01:07\\ntherefore you can't get all the way down\\n00:01:09\\nto the bottom now all you need to do is\\n00:01:11\\n00:01:12\\nif you put your heels on a yoga block or\\n00:01:14\\na book you're in that position rather\\n00:01:16\\nthan that position that will take away\\n00:01:19\\nyour ankle mobility and allow you to get\\n00:01:21\\nall the way down now you can use a yoga\\n00:01:23\\nblock or a book or two or three of them\\n00:01:24\\nif you're a girl squatting in heels 
it's\\n00:01:27\\nso much easier and squatting flat on the\\n00:01:29\\nfloor so what you do is you start on\\n00:01:31\\nwhatever angle you're comfortable in and\\n00:01:33\\nyou work your way down to becoming flat\\n00:01:36\\non the floor so really a squat is just a\\n00:01:39\\ntest of your ankles if you've got good\\n00:01:42\\nankles you're great at squatting and if\\n00:01:45\\nyour ankles have become tight then squat\\n00:01:47\\nis really difficult and hard but like\\n00:01:49\\nevery other muscle and joint in the body\\n00:01:51\\nis trainable and you've got time to get\\n00:01:54\\nit back if you do something for 10\\n00:01:56\\nminutes every day kind of like brushing\\n00:01:58\\nyour teeth it gets into a habit and you\\n00:02:01\\ngo you get up you brush your teeth go to\\n00:02:02\\nveggie brush your teeth so by doing 10\\n00:02:05\\nminutes every day\\n00:02:06\\nyou'll get into the habit by playing\\n00:02:08\\nusing just that short of a short amount\\n00:02:10\\nof time you can really focus on how your\\n00:02:12\\nbody's anemic\\n00:02:13\\nnow some of the exercises that I\\n00:02:15\\nrecommend that you do within those 10\\n00:02:16\\nminutes first one is the squat sitting\\n00:02:20\\nin a squat not bobbing up and down just\\n00:02:22\\nbeing able to function to sit in the\\n00:02:24\\nsquat something again is innate to us as\\n00:02:27\\nhuman beings and something that you are\\n00:02:28\\nable to do as a child so spend 10\\n00:02:31\\nminutes every day getting your squat\\n00:02:34\\nback\\n00:02:37\\n[Music]\\n00:02:51\")],\n", - " 'clip_text_prompts': ['person doing squats'],\n", - " 'segment_infos': [SegmentInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', fps=25.0, video_id='IB_icWRzi4E'),\n", - " SegmentInfo(start_timestamp='00:00:01.000', end_timestamp='00:00:44.000', fps=25.0, video_id='xqvCmoLULNY')],\n", - " 'clues': [SegmentWithClueInfo(start_timestamp='00:00:00.000', end_timestamp='00:00:05.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"hi I'm Roger Frampton and I'm a movement\", quote_timestamp_start='00:00:02', quote_timestamp_end='00:00:05', clue='Introduction of the speaker, Roger Frampton, who is a movement coach.')], global_clues=[GlobalClue(id='GC1', quote=\"hi I'm Roger Frampton and I'm a movement coach from London and I'm author of the book the flexible body\", quote_timestamp_start='00:00:02', quote_timestamp_end='00:00:07', clue='Introduction of Roger Frampton as a movement coach and author.', relevance_to_segment=\"This provides context about the speaker's expertise and credibility, which is relevant to understanding the importance of the information presented in the segment.\")], logical_inferences=[LogicalInference(id='LI1', description='Introduction of the Speaker', details='The segment serves as an 
introduction to Roger Frampton, establishing his credentials and setting the stage for the subsequent discussion on squats.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:00:09.000', end_timestamp='00:00:24.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='so this position is the squat', quote_timestamp_start='00:00:10', quote_timestamp_end='00:00:11', clue='Introduction of the squat position.'), LocalClue(id='LC2', quote=\"most people when I talk about the squat thinking when I'm forming up and down exercise it works your legs and bar\", quote_timestamp_start='00:00:11', quote_timestamp_end='00:00:16', clue='Common misconception about the squat being just an up-and-down exercise.'), LocalClue(id='LC3', quote=\"well the squat is actually a position that we're designed to defecate in every\", quote_timestamp_start='00:00:16', quote_timestamp_end='00:00:19', clue=\"Explanation of the squat's original purpose for defecation.\"), LocalClue(id='LC4', quote='kid sits and rests in this position and if we look at Western people I think most people end up sitting up on their toes more in this position', quote_timestamp_start='00:00:22', quote_timestamp_end='00:00:25', clue='Comparison of how children and Western adults sit in the squat position.')], global_clues=[GlobalClue(id='GC1', quote=\"so this position is the squat most people when I talk about the squat thinking when I'm forming up and down exercise it works your legs and bar\", quote_timestamp_start='00:00:10', quote_timestamp_end='00:00:16', clue='Introduction of the squat position and common misconceptions.', relevance_to_segment='This provides context for the segment by explaining what the squat is and addressing common misconceptions.'), GlobalClue(id='GC2', quote=\"well the squat is actually a position that we're designed to defecate in every kid sits and rests in this position\", quote_timestamp_start='00:00:16', quote_timestamp_end='00:00:22', clue=\"Explanation of the squat's original purpose and how children naturally sit in this position.\", relevance_to_segment='This provides a historical and functional context for the squat, which is relevant to understanding its importance.')], logical_inferences=[LogicalInference(id='LI1', description=\"Explanation of Squat's Purpose\", details='The segment explains the original purpose of the squat position for defecation and how it is a natural resting position for children, contrasting it with the common misconception of it being just an exercise.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:00:29.000', end_timestamp='00:00:45.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"not for an exercise necessarily not because of fitness or to be fitter but just because you're designed to do it\", quote_timestamp_start='00:00:38', quote_timestamp_end='00:00:40', clue='Emphasis on the squat being a natural position rather than just an exercise.'), LocalClue(id='LC2', quote=\"this here is just the human resting position so when I run classes people always talk about the squat how they can't quite get their heel down on the floor\", quote_timestamp_start='00:00:43', quote_timestamp_end='00:00:45', clue='Description of the squat as a human resting position and common issue of not being able to get heels down.')], global_clues=[GlobalClue(id='GC1', quote=\"this here is just the human resting position so when I run classes people always talk about the squat how they can't quite get their heel down on the floor\", 
quote_timestamp_start='00:00:43', quote_timestamp_end='00:00:45', clue='Description of the squat as a human resting position and common issue of not being able to get heels down.', relevance_to_segment='This provides context for the segment by explaining the squat as a natural resting position and a common issue people face.')], logical_inferences=[LogicalInference(id='LI1', description='Natural Resting Position', details='The segment emphasizes that the squat is a natural human resting position and not just an exercise, highlighting a common issue people face with getting their heels down.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:00:49.000', end_timestamp='00:00:53.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"since about the age of four years old you've been wearing shoes\", quote_timestamp_start='00:00:53', quote_timestamp_end='00:00:55', clue='Introduction of the idea that wearing shoes from a young age affects squat mobility.')], global_clues=[GlobalClue(id='GC1', quote=\"since about the age of four years old you've been wearing shoes\", quote_timestamp_start='00:00:53', quote_timestamp_end='00:00:55', clue='Introduction of the idea that wearing shoes from a young age affects squat mobility.', relevance_to_segment='This provides context for the segment by explaining how wearing shoes from a young age affects squat mobility.')], logical_inferences=[LogicalInference(id='LI1', description='Impact of Wearing Shoes', details='The segment introduces the idea that wearing shoes from a young age affects squat mobility, setting the stage for further explanation.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:01:02.000', end_timestamp='00:01:06.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote=\"we are an animal we're designed to walk around on bare feet and the reason that we'll lose our squat mobility is simply because we come to become tight in our ankles\", quote_timestamp_start='00:01:02', quote_timestamp_end='00:01:07', clue='Explanation of how wearing shoes leads to tight ankles and loss of squat mobility.')], global_clues=[GlobalClue(id='GC1', quote=\"we are an animal we're designed to walk around on bare feet and the reason that we'll lose our squat mobility is simply because we come to become tight in our ankles\", quote_timestamp_start='00:01:02', quote_timestamp_end='00:01:07', clue='Explanation of how wearing shoes leads to tight ankles and loss of squat mobility.', relevance_to_segment='This provides context for the segment by explaining how wearing shoes leads to tight ankles and loss of squat mobility.')], logical_inferences=[LogicalInference(id='LI1', description='Loss of Squat Mobility', details='The segment explains that wearing shoes leads to tight ankles and loss of squat mobility, emphasizing the importance of walking barefoot.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:01:43.000', end_timestamp='00:01:49.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='if your ankles have become tight then squat is really difficult and hard', quote_timestamp_start='00:01:45', quote_timestamp_end='00:01:47', clue='This phrase explains the difficulty of squatting due to tight ankles, which is likely demonstrated in this segment.'), LocalClue(id='LC2', quote='but like every other muscle and joint in the body is trainable', quote_timestamp_start='00:01:47', quote_timestamp_end='00:01:49', clue='This phrase suggests that despite the difficulty, ankle mobility 
can be improved with training.')], global_clues=[GlobalClue(id='GC1', quote=\"if you put your heels on a yoga block or a book you're in that position rather than that position that will take away your ankle mobility and allow you to get all the way down\", quote_timestamp_start='00:01:12', quote_timestamp_end='00:01:16', clue='This quote explains a method to improve squat depth by using a yoga block or book to compensate for limited ankle mobility.', relevance_to_segment='This provides context for the difficulty mentioned in LC1 and suggests a solution for improving squat form.'), GlobalClue(id='GC2', quote=\"so really a squat is just a test of your ankles if you've got good ankles you're great at squatting\", quote_timestamp_start='00:01:39', quote_timestamp_end='00:01:42', clue='This quote emphasizes the importance of ankle mobility in performing a proper squat.', relevance_to_segment='This reinforces the idea presented in LC1 that tight ankles make squatting difficult.')], logical_inferences=[LogicalInference(id='LI1', description='Ankle Mobility Focus', details='The segment likely demonstrates the impact of tight ankles on squat performance and suggests that improving ankle mobility is key to better squatting.'), LogicalInference(id='LI2', description='Trainability of Ankle Mobility', details='The segment emphasizes that, like other muscles and joints, ankle mobility can be improved with consistent training, making squatting easier over time.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:02:18.000', end_timestamp='00:02:27.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='first one is the squat sitting in a squat not bobbing up and down just being able to function to sit in the squat', quote_timestamp_start='00:02:18', quote_timestamp_end='00:02:22', clue='This phrase introduces the exercise of sitting in a squat position without movement, emphasizing stability and function.'), LocalClue(id='LC2', quote='something again is innate to us as human beings and something that you are able to do as a child', quote_timestamp_start='00:02:22', quote_timestamp_end='00:02:27', clue='This phrase highlights that the ability to squat is a natural human function that we are born with.')], global_clues=[GlobalClue(id='GC1', quote=\"so by doing 10 minutes every day you'll get into the habit\", quote_timestamp_start='00:02:05', quote_timestamp_end='00:02:08', clue='This quote suggests that consistent daily practice can help regain the ability to squat.', relevance_to_segment='This provides context for the exercise introduced in LC1, emphasizing the importance of regular practice.'), GlobalClue(id='GC2', quote='spend 10 minutes every day getting your squat back', quote_timestamp_start='00:02:31', quote_timestamp_end='00:02:34', clue='This quote reinforces the idea of daily practice to improve squatting ability.', relevance_to_segment='This supports the exercise mentioned in LC1 and LC2, suggesting that daily practice is key to regaining natural squatting ability.')], logical_inferences=[LogicalInference(id='LI1', description='Importance of Daily Practice', details='The segment likely emphasizes the importance of spending a few minutes each day practicing sitting in a squat to regain natural squatting ability.'), LogicalInference(id='LI2', description='Natural Human Function', details='The segment underscores that squatting is an innate human ability that can be restored through consistent practice.')])),\n", - " SegmentWithClueInfo(start_timestamp='00:00:01.000', 
end_timestamp='00:00:44.000', segment_annotation=SegmentAnnotation(local_clues=[LocalClue(id='LC1', quote='start with your feet slightly wider than shoulder width apart', quote_timestamp_start='00:00:02', quote_timestamp_end='00:00:04', clue='This instruction sets the initial stance for performing a proper squat.'), LocalClue(id='LC2', quote='cross your arms in front so touch your right hand to your left shoulder and vice versa pointing your elbows straight ahead', quote_timestamp_start='00:00:06', quote_timestamp_end='00:00:13', clue='Describes the arm position to maintain balance and proper form during the squat.'), LocalClue(id='LC3', quote='shift your weight to the ball of your feet and bend your knees', quote_timestamp_start='00:00:15', quote_timestamp_end='00:00:18', clue='Guides the weight distribution and initial movement for the squat.'), LocalClue(id='LC4', quote='get as close to 90 degrees as you can looking straight ahead and from here push back up to the starting position', quote_timestamp_start='00:00:21', quote_timestamp_end='00:00:25', clue='Describes the depth of the squat and the motion to return to the starting position.'), LocalClue(id='LC5', quote=\"this is going to be great for strengthening your thighs or your quadriceps as well as your butt or your glutes and it's also going to be great to work on your posture\", quote_timestamp_start='00:00:29', quote_timestamp_end='00:00:37', clue='Explains the benefits of performing squats correctly, including muscle strengthening and posture improvement.'), LocalClue(id='LC6', quote=\"that's how to properly perform a squat\", quote_timestamp_start='00:00:41', quote_timestamp_end='00:00:44', clue='Concludes the segment by summarizing that the instructions provided are for performing a proper squat.')], global_clues=[GlobalClue(id='GC1', quote=\"let's learn how to properly perform a squat\", quote_timestamp_start='00:00:00', quote_timestamp_end='00:00:01', clue='Introduces the topic of the video, which is learning the proper technique for squatting.', relevance_to_segment='This statement sets the context for the entire segment, indicating that the instructions provided are aimed at teaching the correct squat form.'), GlobalClue(id='GC2', quote=\"this is going to be great for strengthening your thighs or your quadriceps as well as your butt or your glutes and it's also going to be great to work on your posture\", quote_timestamp_start='00:00:29', quote_timestamp_end='00:00:37', clue='Explains the benefits of performing squats correctly, including muscle strengthening and posture improvement.', relevance_to_segment='This reinforces the importance of following the proper technique as described in the segment to achieve these benefits.')], logical_inferences=[LogicalInference(id='LI1', description='Step-by-Step Instruction for Proper Squat', details='The segment provides a detailed, step-by-step guide on how to perform a squat correctly, starting from the initial stance to the final position. 
This includes specific instructions on foot placement, arm positioning, weight distribution, and movement.'), LogicalInference(id='LI2', description='Emphasis on Proper Form and Benefits', details='The segment emphasizes the importance of proper form in performing squats to avoid injury and maximize benefits such as strengthening the quadriceps, glutes, and improving posture.'), LogicalInference(id='LI3', description='Educational Purpose', details='The primary purpose of this segment is educational, aiming to teach viewers the correct technique for squatting to ensure they perform the exercise safely and effectively.')]))],\n", - " 'annotations': ['{\"squats_probability\": \"low\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", - " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", - " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the squat as a natural human resting position, which is a correct understanding.\", \"wrong\": \"The segment mentions a common issue where people can\\'t get their heels down on the floor during a squat.\", \"correction\": \"Focus on ankle mobility exercises and practice squatting with support to gradually improve heel placement.\"}}',\n", - " '{\"squats_probability\": \"unknown\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", - " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": null, \"squats_feedback\": {\"right\": null, \"wrong\": null, \"correction\": null}}',\n", - " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"incorrect\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of ankle mobility in squatting and suggests that it can be improved with training.\", \"wrong\": \"The segment likely demonstrates difficulty in squatting due to tight ankles.\", \"correction\": \"To improve squat depth and form, consider placing your heels on a yoga block or a book to compensate for limited ankle mobility.\"}}',\n", - " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"unknown\", \"squats_feedback\": {\"right\": \"The segment emphasizes the importance of stability and function in the squat position, highlighting that squatting is a natural human ability.\", \"wrong\": null, \"correction\": \"Practice sitting in a squat position daily for 10 minutes to regain natural squatting ability.\"}}',\n", - " '{\"squats_probability\": \"high\", \"squats_technique_correctness\": \"correct\", \"squats_feedback\": {\"right\": \"The segment provides a detailed, step-by-step guide on how to perform a squat correctly, including specific instructions on foot placement, arm positioning, weight distribution, and movement.\", \"wrong\": null, \"correction\": null}}']}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "graph.get_state(thread).values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "datagen", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/agent/agent.py b/agent/agent.py new file mode 100644 index 0000000..abc6611 --- /dev/null +++ b/agent/agent.py @@ -0,0 +1,113 @@ +from typing import TypedDict, Annotated, Sequence, List, Optional +import operator +from langchain_openai import AzureChatOpenAI +from langgraph.graph import StateGraph, END +from langgraph.checkpoint.memory import MemorySaver + +from tools.scraping import gen_queries, get_video_ids, download, VideoInfo +from tools.video_chunking import detect_segments, SegmentInfo +from tools.annotating import extract_clues, gen_annotations + +from tools.prompts import ( + GEN_QUERIES_PROMPT, + EXTRACT_CLUES_PROMPT, + GEN_ANNOTATIONS_PROMPT, +) + + +llm = AzureChatOpenAI( + temperature=0.0, + azure_deployment="gpt4o", + openai_api_version="2023-07-01-preview", +) + +memory = MemorySaver() +# memory = SqliteSaver.from_conn_string(":memory:") + + +class AgentState(TypedDict): + task: str + search_queries: List[str] + video_ids: List[str] + video_infos: List[VideoInfo] + clip_text_prompts: List[str] + segment_infos: List[SegmentInfo] + clues: List[str] + annotations: List[str] + + +class DataAgent: + def __init__(self, llm, memory): + self.llm = llm + self.memory = memory + self.graph = self.build_graph() + + def build_graph(self): + builder = StateGraph(AgentState) + + builder.add_node("generate_queries", self.gen_queries_node) + builder.add_node("get_video_ids", self.get_video_ids_node) + builder.add_node("download", self.download_node) + builder.add_node("detect_segments", self.detect_segments_node) + builder.add_node("extract_clues", self.extract_clues_node) + builder.add_node("gen_annotations", self.gen_annotations_node) + + builder.set_entry_point("generate_queries") + + builder.add_edge("generate_queries", "get_video_ids") + builder.add_edge("get_video_ids", "download") + builder.add_edge("download", "detect_segments") + builder.add_edge("detect_segments", "extract_clues") + builder.add_edge("extract_clues", "gen_annotations") + builder.add_edge("gen_annotations", END) + + graph = builder.compile(checkpointer=memory) + + return graph + + def gen_queries_node(self, state: AgentState): + search_queries = gen_queries(self.llm, state["task"], GEN_QUERIES_PROMPT) + return {"search_queries": search_queries[:2]} + + def get_video_ids_node(self, state: AgentState): + video_ids = get_video_ids(state["search_queries"]) + return {"video_ids": video_ids} + + def download_node(self, state: AgentState): + video_infos = download(state["video_ids"]) + return {"video_infos": video_infos} + + def detect_segments_node(self, state: AgentState): + segment_infos = detect_segments( + state["video_infos"], state["clip_text_prompts"] + ) + return {"segment_infos": segment_infos} + + def extract_clues_node(self, state: AgentState): + clues = extract_clues( + 
diff --git a/agent/agent_parts.ipynb b/agent/agent_parts.ipynb new file mode 100644 index 0000000..a482a21 --- /dev/null +++ b/agent/agent_parts.ipynb @@ -0,0 +1,946 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "_ = load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import TypedDict, Annotated, Sequence, List, Optional\n", + "import operator\n", + "\n", + "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", + "from langchain.pydantic_v1 import BaseModel, Field\n", + "from langchain_core.prompts import ChatPromptTemplate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import AzureChatOpenAI\n", + "\n", + "llm = AzureChatOpenAI(\n", + " temperature=0.0,\n", + " azure_deployment=\"gpt4o\",\n", + " openai_api_version=\"2023-07-01-preview\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class VideoInfo(BaseModel):\n", + " video_id: str\n", + " url: str\n", + " relative_video_path: str\n", + " subs: str\n", + " transcript: str\n", + "\n", + "\n", + "class SegmentInfo(BaseModel):\n", + " start_timestamp: str\n", + " end_timestamp: str\n", + " fps: float\n", + " video_id: str\n", + "\n", + "\n", + "class LocalClue(BaseModel):\n", + " \"\"\"Local clues for a segment\"\"\"\n", + "\n", + " id: str = Field(description=\"LC1,LC2...\")\n", + " quote: str = Field(\n", + " description=\"the quote from the transcript that was used to create this clue.\"\n", + " )\n", + " quote_timestamp_start: str = Field(\n", + " description=\"the exact start timestamp of the quote.\"\n", + " )\n", + " quote_timestamp_end: str = Field(\n", + " description=\"the exact end timestamp of the quote.\"\n", + " )\n", + " clue: str = Field(description=\"the main clue data\")\n", + "\n", + "\n", + "class GlobalClue(BaseModel):\n", + " \"\"\"Global clues for a segment\"\"\"\n", + "\n", + " id: str = Field(description=\"GC1,GC2...\")\n", + " quote: str = Field(\n", + " description=\"the quote from the transcript that was used to create this clue.\"\n", + " )\n", + " quote_timestamp_start: str = Field(\n", + " description=\"the exact start timestamp of the quote.\"\n", + " )\n", + " quote_timestamp_end: str = Field(\n", + " description=\"the exact end timestamp of the quote.\"\n", + " )\n", + " clue: str = Field(description=\"the main clue data.\")\n", + " relevance_to_segment: str = Field(\n", + " description=\"why do you think this global clue is relevant to the segment you are working with right now.\"\n", + " )\n", + "\n", + "\n",
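+ "# LogicalInference entries are deduced from the local and global clues above,\n", + "# as EXTRACT_CLUES_PROMPT (defined below) instructs the model to do.\n",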
+ "class LogicalInference(BaseModel):\n", + " \"\"\"Logical inferences for a segment\"\"\"\n", + "\n", + " id: str = Field(description=\"LI1,LI2,...\")\n", + " description: str = Field(description=\"A concise form of the logical inference.\")\n", + " details: str = Field(\n", + " description=\"A verbose explanation of what insight about what happens in this segment should be made based on the clues that you found.\"\n", + " )\n", + "\n", + "\n", + "class SegmentAnnotation(BaseModel):\n", + " local_clues: list[LocalClue] = Field(\n", + " description=\"Local clues are inside the segment in terms of timestamps.\"\n", + " )\n", + " global_clues: list[GlobalClue] = Field(\n", + " description=\"Global clues are scattered across the entire transcript.\"\n", + " )\n", + " logical_inferences: list[LogicalInference] = Field(\n", + " description=\"What we can infer about the topic the user is looking for in the video, based on the clues inside this segment\"\n", + " )\n", + "\n", + "\n", + "class SegmentWithClueInfo(BaseModel):\n", + " \"\"\"\n", + " Annotation for a video segment.\n", + " \"\"\"\n", + "\n", + " start_timestamp: str = Field(\n", + " description=\"start timestamp of the segment in format HH:MM:SS.MS\"\n", + " )\n", + " end_timestamp: str = Field(\n", + " description=\"end timestamp of the segment in format HH:MM:SS.MS\"\n", + " )\n", + " segment_annotation: SegmentAnnotation = Field(\n", + " description=\"annotations for the segment\"\n", + " )\n", + "\n", + "\n", + "class VideoAnnotation(BaseModel):\n", + " \"\"\"\n", + " Segments of a video.\n", + " \"\"\"\n", + "\n", + " segments: list[SegmentWithClueInfo] = Field(\n", + " description=\"information about each segment\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2. Create the state\n", + "\n", + "\n", + "class AgentState(TypedDict):\n", + " task: str\n", + " search_queries: List[str]\n", + " video_ids: List[str]\n", + " video_infos: List[VideoInfo]\n", + " clip_text_prompts: List[str]\n", + " segment_infos: List[SegmentInfo]\n", + " clues: List[str]\n", + " annotations: List[str]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 3. Set prompts\n", + "\n", + "GEN_QUERIES_PROMPT = (\n", + " \"You are helping the user to find a very large and diverse set of videos on a video hosting service.\",\n", + " \"A user will only describe which videos they are looking for and how many queries they need.\",\n", + ")\n", + "\n", + "# prompt='I want to find instructional videos about how to do squats.',\n", + "# num_queries_prompt = f'I need {num_queries} queries'\n", + "\n", + "EXTRACT_CLUES_PROMPT = \"\"\"You are a highly intelligent data investigator. \n", + "You take unstructured damaged data and look for clues that could help restore the initial information\n", + "and extract important insights from it.\n", + "You are the best one for this job in the world because you are a former detective. \n", + "You care about even the smallest details, and your guesses about what happened in the initial file\n", + "even at very limited inputs are usually absolutely right. \n", + "You use deductive and inductive reasoning at the highest possible quality.\n", + "\n", + "#YOUR TODAY'S JOB\n", + "The user needs to learn about what happens in a specific segment of a video file. 
Your job is to help the user by providing clues that would help the user make the right assumption.\n", + "The user will provide you with: \n", + "1. Instructions about what kind of information the user is trying to obtain.\n", + "2. A list of time codes of the segments in format \"-\". All the provided segments of the video contain what the user is looking for, but other parts of the video might have different content.\n", + "3. A transcript of the *full video* in format of \"\\\\n\"\n", + "\n", + "Your task:\n", + "1. Read the transcript.\n", + "2. Provide the clues in the given format.\n", + "3. Provide any other info requested by the user.\n", + "\n", + "#RULES\n", + "!!! VERY IMPORTANT !!!\n", + "1. Rely only on the data provided in the transcript. Do not improvise. All the quotes and corresponding timestamps must be taken from the transcript. Quote timestamps must be taken directly from the transcript.\n", + "2. Your job is to find the data already provided in the transcript.\n", + "3. Analyze every segment. Only skip a segment if there is no information about it in the transcript.\n", + "4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.\n", + "5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.\n", + "6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.\n", + "7. Follow the output format.\n", + "8. Be very careful with details. Don't generalize. Always double check your results.\n", + "\n", + "Please, help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.\n", + "\n", + "WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data, \n", + "is a fragment of information extracted from a corrupted or incomplete source that provides \n", + "insight into the original content. These fragments serve as starting points for inference \n", + "and deduction, allowing researchers to hypothesize about the fuller context or meaning of \n", + "the degraded material. The process of identifying and interpreting clues involves both objective analysis of the \n", + "available data and subjective extrapolation based on domain knowledge, contextual understanding, \n", + "and logical reasoning.\n", + "\n", + "Here is what the user expects to have from you:\n", + "1. *Local clues* that would help the user understand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside a specific segment.\n", + "2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific segment that they are provided for.\n", + "3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. 
Logical inferences for a segment are deduced from local and global clues for this segment.\n", + "\n", + "!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!\n", + "\n", + " Good local clues examples: [\n", + " {\n", + " \"id\": \"LC1\",\n", + " \"timestamp\": \"00:00:19\",\n", + " \"quote\": \"exercises do them wrong and instead of\",\n", + " \"clue\": \"This phrase introduces the concept of incorrect exercise form, setting the stage for a demonstration of improper technique.\"\n", + " },\n", + " {\n", + " \"id\": \"LC2\",\n", + " \"timestamp\": \"00:00:21\",\n", + " \"quote\": \"growing nice quads and glutes you'll\",\n", + " \"clue\": \"Mentions the expected benefits of proper squats (muscle growth), implying that these benefits won't be achieved with incorrect form.\"\n", + " },\n", + " {\n", + " \"id\": \"LC3\",\n", + " \"timestamp\": \"00:00:22\",\n", + " \"quote\": \"feel aches and pains in your knees your\",\n", + " \"clue\": \"Directly states negative consequences of improper form, strongly suggesting that this segment demonstrates incorrect technique.\"\n", + " },\n", + " {\n", + " \"id\": \"LC4\",\n", + " \"timestamp\": \"00:00:24\",\n", + " \"quote\": \"lower back and even your shoulders\",\n", + " \"clue\": \"Continuation of LC3, emphasizing multiple areas of potential pain from improper form.\"\n", + " },\n", + " {\n", + " \"id\": \"LC5\",\n", + " \"timestamp\": \"00:00:26\",\n", + " \"quote\": \"let's see how to do it correctly\",\n", + " \"clue\": \"This phrase suggests a transition is about to occur. The incorrect form has been shown, and correct form will follow.\"\n", + " }\n", + " ]\n", + "\n", + " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", + " For example, if the transcript says:\n", + " \"00:05:02\n", + " he took the glasses\n", + " 00:05:04\n", + " and gave them to me\"\n", + " Then a GOOD output will be:\n", + " - timestamp: 00:05:03\n", + " - quote: \"he took the glasses and gave them to me\"\n", + " And a BAD output would be:\n", + " - timestamp: 00:04:02\n", + " - quote: \"he gave me the glasses\"\n", + "\n", + " Good global clues examples: [\n", + " {\n", + " \"id\": \"GC1\",\n", + " \"timestamp\": \"00:01:15\",\n", + " \"quote\": \"Before we dive into specific techniques, let's talk about safety.\",\n", + " \"clue\": \"Introduces the theme of safety in squatting.\",\n", + " \"relevance_to_segment\": \"This earlier emphasis on safety provides context for why proper depth is important and why it's being addressed in our segment. It connects to the fear of knee pain mentioned in LC3.\"\n", + " },\n", + " {\n", + " \"id\": \"GC2\",\n", + " \"timestamp\": \"00:02:30\",\n", + " \"quote\": \"Squatting is a fundamental movement pattern in everyday life.\",\n", + " \"clue\": \"Emphasizes the importance of squats beyond just exercise.\",\n", + " \"relevance_to_segment\": \"This broader context heightens the importance of learning proper squat depth as demonstrated in our segment. It suggests that the techniques shown have applications beyond just gym workouts.\"\n", + " },\n", + " {\n", + " \"id\": \"GC3\",\n", + " \"timestamp\": \"00:05:20\",\n", + " \"quote\": \"If you have existing knee issues, consult a physician before attempting deep squats.\",\n", + " \"clue\": \"Provides a health disclaimer related to squat depth.\",\n", + " \"relevance_to_segment\": \"While this comes after our segment, it's relevant because it addresses the concern about knee pain mentioned in LC3. 
It suggests that the demonstration in our segment is generally safe but acknowledges individual variations.\"\n", + " },\n", + " {\n", + " \"id\": \"GC4\",\n", + " \"timestamp\": \"00:06:45\",\n", + " \"quote\": \"Proper depth ensures full engagement of your quadriceps and glutes.\",\n", + " \"clue\": \"Explains the benefit of correct squat depth.\",\n", + " \"relevance_to_segment\": \"This later explanation provides justification for the depth guideline given in LC4. It helps viewers understand why the demonstrated technique is important.\"\n", + " },\n", + " {\n", + " \"id\": \"GC5\",\n", + " \"timestamp\": \"00:00:30\",\n", + " \"quote\": \"Today, we'll cover squat variations for beginners to advanced lifters.\",\n", + " \"clue\": \"Outlines the scope of the entire video.\",\n", + " \"relevance_to_segment\": \"This early statement suggests that our segment, focusing on proper depth, is part of a comprehensive guide. It implies that the demonstration might be adaptable for different skill levels.\"\n", + " }\n", + " ]\n", + " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", + " For example, if the transcript says:\n", + " \"00:05:02\n", + " he took the glasses\n", + " 00:05:04\n", + " and gave them to me\"\n", + " Then a GOOD output will be:\n", + " - timestamp: 00:05:03\n", + " - quote: \"he took the glasses and gave them to me\"\n", + " And a BAD output would be:\n", + " - timestamp: 00:04:02\n", + " - quote: \"he gave me the glasses\"\n", + " \n", + "\n", + " Good logical inference examples:\n", + " [\n", + " {\n", + " \"id\": \"LI1\",\n", + " \"description\": \"Primary Demonstration of Heel Lift\",\n", + " \"details\": \"Given that GC1-GC3 describe the 'most common mistake' as heels lifting off the ground, and this description immediately precedes our segment, it's highly probable that this is the primary error being demonstrated. This is further supported by the segment's focus on incorrect form (LC1-LC4).\"\n", + " },\n", + " {\n", + " \"id\": \"LI2\",\n", + " \"description\": \"Multiple Error Demonstration\",\n", + " \"details\": \"While heel lift is likely the primary focus, the mention of multiple pain points (knees, lower back, shoulders in LC3-LC4) suggests that the demonstrator may be exhibiting several forms of incorrect technique simultaneously. This comprehensive 'what not to do' approach would be pedagogically effective.\"\n", + " },\n", + " {\n", + " \"id\": \"LI3\",\n", + " \"description\": \"Possible Inclusion of 'Butt Wink'\",\n", + " \"details\": \"Although 'butt wink' is mentioned after our segment (GC4-GC6), its connection to back pain (which is mentioned in LC4) raises the possibility that this error is also present in the demonstration. The instructor may be showing multiple errors early on, then breaking them down individually later.\"\n", + " },\n", + " {\n", + " \"id\": \"LI4\",\n", + " \"description\": \"Segment Placement in Overall Video Structure\",\n", + " \"details\": \"The segment's position (starting at 00:00:19) and the phrase 'let's see how to do it correctly' (LC5) at the end suggest this is an early, foundational part of the video. 
It likely serves to grab attention by showing common mistakes before transitioning to proper form instruction.\"\n", + " },\n", + " {\n", + " \"id\": \"LI5\",\n", + " \"description\": \"Intentional Exaggeration of Errors\",\n", + " \"details\": \"Given the educational nature of the video, it's plausible that the demonstrator is intentionally exaggerating the incorrect form. This would make the errors more obvious to viewers and enhance the contrast with correct form shown later.\"\n", + " }\n", + " ]\n", + "\"\"\"\n", + "\n", + "\n", + "GEN_ANNOTATIONS_PROMPT = \"\"\"You are a helpful assistant that performs high quality data investigation and transformation.\n", + " You will be given a JSON object with clues and other helpful information about what's going on \n", + " in a specific part of a video file. This part is called a segment. Your job is to:\n", + " 1. Read this JSON object carefully\n", + " 2. Answer the user's questions about this segment\n", + " 3. Provide the answer as a JSON object in a schema provided by the user\n", + " Important rules:\n", + " 1. You can only rely on data presented in a provided JSON object. Don't improvise.\n", + " 2. Follow the user's request carefully.\n", + " 3. Don't rush to deliver the answer. Take some time to think. Take a deep breath. Then start writing.\n", + " 4. If you want to output a field as empty (null), output it as JSON null (without quotes), not as a string \"null\". \n", + "—> GOOD EXAMPLES:\n", + " \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", + " \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", + " \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", + " \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", + " \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", + " \"correction\":\"Keep your weight on your heels and press through them as you rise.\"\n", + " \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", + " \"correction\":null\n", + "—> BAD EXAMPLES:\n", + " \"wrong\":\"knees\"\n", + " \"correction\":\"fix knees\"\n", + " \"wrong\":\"back looks funny\"\n", + " \"correction\":\"make back better\"\n", + " \"wrong\":\"feet are doing something\"\n", + " \"correction\":\"feet should be different\"\n", + " \"right\":\"arms\"\n", + " \"correction\":\"arms are fine i think\"\n", + "—> BAD EXAMPLES END HERE\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import scrapetube\n", + "import yt_dlp\n", + "from datetime import datetime\n", + "from pathlib import Path\n", + "from collections import defaultdict\n", + "from datagen.core.sub_utils import vtt_to_txt\n", + "from datagen.detect_segments import get_segments\n", + "import torch\n", + "from transformers import AutoModel, AutoProcessor\n", + "import pandas as pd\n", + "from tsmoothie.smoother import LowessSmoother" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import decord\n", + "import math\n", + "import numpy as np\n", + "\n", + "# decord.bridge.set_bridge(\"torch\")\n", + "\n", + "\n", + "class VideoInferenceDataset(torch.utils.data.IterableDataset):\n", + " def __init__(self, video_infos: List[VideoInfo], local_root: Path):\n", + " super().__init__()\n", + "\n", + " self.video_infos = 
video_infos\n", + " self.local_root = local_root\n", + " self.frame_generator = self.get_frame_generator(video_infos, local_root)\n", + "\n", + " @staticmethod\n", + " def get_frame_generator(video_infos, local_root: Path):\n", + "\n", + " for video_idx, video_info in enumerate(video_infos):\n", + " video_path = local_root.joinpath(video_info.relative_video_path)\n", + " vr = decord.VideoReader(str(video_path))\n", + " num_frames = len(vr)\n", + " fps = vr.get_avg_fps()\n", + " frame_indices = range(0, num_frames, round(fps))\n", + "\n", + " for frame_idx in frame_indices:\n", + " # print(f\"Frame idx {frame_idx}\")\n", + " frame = vr[frame_idx].asnumpy()\n", + " yield {\n", + " \"frame\": frame,\n", + " \"frame_idx\": frame_idx,\n", + " \"video_id\": video_idx,\n", + " }\n", + "\n", + " def __next__(self):\n", + " return next(self.frame_generator)\n", + "\n", + " def __iter__(self):\n", + " return self" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import math\n", + "\n", + "# 4. Create nodes\n", + "\n", + "\n", + "def gen_queries_node(state: AgentState):\n", + " class QueryList(BaseModel):\n", + " \"\"\"A list of queries to find videos on a video hosting service\"\"\"\n", + "\n", + " search_queries: list[str] = Field(default=None, description=\"a list of queries\")\n", + "\n", + " messages = [\n", + " SystemMessage(content=str(GEN_QUERIES_PROMPT)),\n", + " HumanMessage(content=state[\"task\"]),\n", + " ]\n", + "\n", + " model = llm.with_structured_output(QueryList)\n", + " response: QueryList = model.invoke(messages)\n", + "\n", + " return {\"search_queries\": response.search_queries[:2]}\n", + "\n", + "\n", + "def get_video_ids_node(state: AgentState):\n", + "\n", + " queries = state[\"search_queries\"]\n", + " videos_per_query = 1\n", + " sleep = 0\n", + " sort_by = \"relevance\"\n", + " results_type = \"video\"\n", + " only_creative_commons = False\n", + "\n", + " video_ids = set()\n", + " for query in queries:\n", + " for video in scrapetube.get_search(\n", + " query=query,\n", + " limit=videos_per_query,\n", + " sleep=sleep,\n", + " sort_by=sort_by,\n", + " results_type=results_type,\n", + " ):\n", + " video_ids.add(video[\"videoId\"])\n", + " video_ids = list(video_ids)\n", + "\n", + " if only_creative_commons:\n", + " video_ids_cc = []\n", + " for i in video_ids:\n", + " YDL_OPTIONS = {\n", + " \"quiet\": True,\n", + " \"simulate\": True,\n", + " \"forceurl\": True,\n", + " }\n", + " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", + " info = ydl.extract_info(f\"youtube.com/watch?v={i}\", download=False)\n", + " if \"creative commons\" in info.get(\"license\", \"\").lower():\n", + " video_ids_cc.append(i)\n", + " video_ids = video_ids_cc\n", + "\n", + " return {\"video_ids\": video_ids}\n", + "\n", + "\n", + "def download_node(state: AgentState):\n", + "\n", + " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", + " video_dir = LOCAL_ROOT / \"videos\"\n", + " sub_dir = LOCAL_ROOT / \"subs\"\n", + "\n", + " discard_path = LOCAL_ROOT / \"videos_without_subs\"\n", + " discard_path.mkdir(parents=True, exist_ok=True)\n", + "\n", + " video_ids = state[\"video_ids\"]\n", + "\n", + " downloaded_video_ids = [video_path.stem for video_path in video_dir.glob(\"*.mp4\")]\n", + " downloaded_video_ids += [\n", + " video_path.stem for video_path in discard_path.glob(\"*.mp4\")\n", + " ]\n", + "\n", + " print(f\"Downloaded video ids: {downloaded_video_ids}\")\n", + "\n", + " only_with_transcripts = True\n", 
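+ " # Keep only videos whose auto-generated subtitles yielded a transcript;\n", + " # the rest are moved to videos_without_subs by the filtering step below.\n",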
+ "\n", + " YDL_OPTIONS = {\n", + " \"writeautomaticsub\": True,\n", + " \"subtitleslangs\": [\"en\"],\n", + " \"subtitlesformat\": \"vtt\",\n", + " \"overwrites\": False,\n", + " \"format\": \"mp4\",\n", + " \"outtmpl\": {\n", + " \"default\": video_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", + " \"subtitle\": sub_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", + " },\n", + " }\n", + "\n", + " video_infos = []\n", + "\n", + " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", + " for video_id in video_ids:\n", + " url = f\"https://www.youtube.com/watch?v={video_id}\"\n", + "\n", + " if video_id not in downloaded_video_ids:\n", + " try:\n", + " ydl.download(url)\n", + " except Exception as e:\n", + " print(datetime.now(), f\"Error at video {video_id}, skipping\")\n", + " print(datetime.now(), e)\n", + " continue\n", + "\n", + " video_path = Path(ydl.prepare_filename({\"id\": video_id, \"ext\": \"mp4\"}))\n", + " sub_path = Path(\n", + " ydl.prepare_filename(\n", + " {\"id\": video_id, \"ext\": \"en.vtt\"}, dir_type=\"subtitle\"\n", + " )\n", + " )\n", + "\n", + " with sub_path.open(\"r\") as f:\n", + " subs = f.read()\n", + "\n", + " transcript = vtt_to_txt(sub_path)\n", + "\n", + " video_info = VideoInfo(\n", + " video_id=video_id,\n", + " url=url,\n", + " relative_video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),\n", + " subs=subs,\n", + " transcript=transcript,\n", + " )\n", + "\n", + " video_infos.append(video_info)\n", + "\n", + " if only_with_transcripts:\n", + " filtered_video_infos = []\n", + " for video_info in video_infos:\n", + " if video_info.transcript:\n", + " filtered_video_infos.append(video_info)\n", + " else:\n", + " video_path = LOCAL_ROOT / video_info.video_path\n", + " video_path.rename(discard_path / video_path.name)\n", + " video_infos = filtered_video_infos\n", + "\n", + " return {\"video_infos\": video_infos}\n", + "\n", + "\n", + "def detect_segments_node(state: AgentState):\n", + "\n", + " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", + "\n", + " clip_text_prompts = state[\"clip_text_prompts\"]\n", + " video_infos = state[\"video_infos\"]\n", + "\n", + " CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n", + "\n", + " model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(\"cuda\")\n", + " processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n", + "\n", + " dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n", + "\n", + " dataloader = torch.utils.data.DataLoader(\n", + " dataset,\n", + " num_workers=1,\n", + " batch_size=12,\n", + " pin_memory=True,\n", + " # worker_init_fn=worker_init_fn,\n", + " )\n", + " dataloader = iter(dataloader)\n", + "\n", + " smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n", + "\n", + " clip_results_dict = defaultdict(list)\n", + "\n", + " print(\"Init model complete\")\n", + "\n", + " batch_counter = 0\n", + " MAX_BATCHES = 50\n", + "\n", + " while batch_counter < MAX_BATCHES:\n", + " batch_counter += 1\n", + " try:\n", + " start_time = time.time()\n", + " batch = next(dataloader)\n", + " # print(f\"Fetch time: {time.time() - start_time:.2f} seconds\")\n", + " except StopIteration:\n", + " break\n", + "\n", + " inputs = processor(\n", + " images=batch[\"frame\"],\n", + " text=clip_text_prompts,\n", + " return_tensors=\"pt\",\n", + " padding=True,\n", + " truncation=True,\n", + " )\n", + " inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n", + "\n", + " outputs = model(**inputs)\n", + "\n", + " logits = outputs.logits_per_image\n", + " probs = 
+ " logits = outputs.logits_per_image\n", + " probs = torch.sigmoid(logits).detach().cpu().numpy()\n", + "\n", + " for video_idx, frame_idx, prob in zip(\n", + " batch[\"video_id\"], batch[\"frame_idx\"], probs\n", + " ):\n", + " # print(type(video_id.item()), type(frame_idx.item()), type(prob.item()))\n", + " video_id = video_infos[video_idx.item()].video_id\n", + "\n", + " clip_results_dict[\"video_id\"].append(video_id)\n", + " clip_results_dict[\"frame_idx\"].append(frame_idx.item())\n", + " clip_results_dict[\"probs\"].append(prob.item())\n", + "\n", + " print(\"All frames processed\")\n", + " clip_results = pd.DataFrame(clip_results_dict)\n", + " print(\"Dataframe created\")\n", + " print(clip_results)\n", + "\n", + " max_gap_seconds = 1\n", + " fps_sampling = 1\n", + " min_prob = 0.1\n", + " min_segment_seconds = 3\n", + " fps = 25\n", + "\n", + " segment_infos = []\n", + " for video_id, video_clip_results in clip_results.groupby(\"video_id\"):\n", + " probs = video_clip_results[\"probs\"].values\n", + " probs = smoother.smooth(probs).smooth_data[0]\n", + " segments_start_end = get_segments(\n", + " probs,\n", + " max_gap=round(max_gap_seconds * fps_sampling),\n", + " min_prob=min_prob,\n", + " min_segment=round(min_segment_seconds * fps_sampling),\n", + " )\n", + "\n", + " print(f\"Segments for video {video_id}: {segments_start_end}\")\n", + "\n", + " sec2ts = lambda s: time.strftime(\n", + " f\"%H:%M:%S.{round((s%1)*1000):03d}\", time.gmtime(s)\n", + " )\n", + "\n", + " for start, end in segments_start_end:\n", + " segment_infos.append(\n", + " SegmentInfo(\n", + " start_timestamp=sec2ts(start),\n", + " end_timestamp=sec2ts(end),\n", + " fps=fps,\n", + " video_id=video_id,\n", + " )\n", + " )\n", + "\n", + " return {\"segment_infos\": segment_infos}\n", + "\n", + "\n", + "def extract_clues_node(state: AgentState):\n", + "\n", + " prompt_template = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", EXTRACT_CLUES_PROMPT),\n", + " (\n", + " \"user\",\n", + " \"Segment timecodes: {{ segment_timecodes }}\\nTranscript: {{ transcript }}\",\n", + " ),\n", + " ],\n", + " template_format=\"jinja2\",\n", + " )\n", + "\n", + " model = prompt_template | llm.with_structured_output(VideoAnnotation)\n", + "\n", + " segment_infos_dict = defaultdict(list)\n", + " for segment_info in state[\"segment_infos\"]:\n", + " segment_infos_dict[segment_info.video_id].append(segment_info)\n", + "\n", + " video_infos_dict = {\n", + " video_info.video_id: video_info for video_info in state[\"video_infos\"]\n", + " }\n", + "\n", + " clues = []\n", + "\n", + " for video_id, segment_infos in segment_infos_dict.items():\n", + " transcript = video_infos_dict[video_id].transcript\n", + " segment_infos_chunks = [\n", + " segment_infos[i : i + 5] for i in range(0, len(segment_infos), 5)\n", + " ]\n", + "\n", + " for chunk in segment_infos_chunks:\n", + " video_annotation: VideoAnnotation = model.invoke(\n", + " {\n", + " \"segment_timecodes\": \"\\n\".join(\n", + " [f\"{s.start_timestamp}-{s.end_timestamp}\" for s in chunk]\n", + " ),\n", + " \"transcript\": transcript,\n", + " }\n", + " )\n", + " clues.extend(video_annotation.segments)\n", + "\n", + " return {\"clues\": clues}\n", + "\n", + "\n",
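+ "# Each SegmentWithClueInfo is annotated independently below; the model sees\n", + "# only the clue JSON, never the video itself, so annotation quality tracks\n", + "# clue quality.\n",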
+ "def gen_annotations_node(state: AgentState):\n", + " class SegmentFeedback(BaseModel):\n", + " right: Optional[str] = Field(description=\"what was right in the performance\")\n", + " wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n", + " correction: Optional[str] = Field(\n", + " description=\"how and in what ways the performance could be improved\"\n", + " )\n", + "\n", + " # The segment timestamps are taken from the provided information.\n", + " class SegmentCompleteAnnotation(BaseModel):\n", + " squats_probability: Optional[str] = Field(\n", + " description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n", + " )\n", + " squats_technique_correctness: Optional[str] = Field(\n", + " description=\"correctness of the squat technique.\"\n", + " )\n", + " squats_feedback: Optional[SegmentFeedback] = Field(\n", + " description=\"what was right and wrong in the squat performance in the segment. When the technique is incorrect, provide instructions on how to correct it.\"\n", + " )\n", + "\n", + " prompt_template = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", GEN_ANNOTATIONS_PROMPT),\n", + " (\"user\", \"Clues: {{ clues }}\"),\n", + " ],\n", + " template_format=\"jinja2\",\n", + " )\n", + "\n", + " model = prompt_template | llm.with_structured_output(SegmentCompleteAnnotation)\n", + "\n", + " clues = state[\"clues\"]\n", + "\n", + " annotations = []\n", + " for clue in clues:\n", + " segment_annotation: SegmentCompleteAnnotation = model.invoke(\n", + " {\"clues\": clue.json()}\n", + " )\n", + "\n", + " annotations.append(segment_annotation.json())\n", + "\n", + " print(annotations)\n", + "\n", + " return {\"annotations\": annotations}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langgraph.graph import StateGraph, END\n", + "from typing import TypedDict, Annotated, List\n", + "import operator\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage\n", + "\n", + "memory = MemorySaver()\n", + "# memory = SqliteSaver.from_conn_string(\":memory:\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = StateGraph(AgentState)\n", + "\n", + "builder.add_node(\"generate_queries\", gen_queries_node)\n", + "builder.add_node(\"get_video_ids\", get_video_ids_node)\n", + "builder.add_node(\"download\", download_node)\n", + "builder.add_node(\"detect_segments\", detect_segments_node)\n", + "builder.add_node(\"extract_clues\", extract_clues_node)\n", + "builder.add_node(\"gen_annotations\", gen_annotations_node)\n", + "\n", + "builder.set_entry_point(\"generate_queries\")\n", + "\n", + "# builder.add_conditional_edges(\n", + "# \"generate\",\n", + "# should_continue,\n", + "# {END: END, \"reflect\": \"reflect\"}\n", + "# )\n", + "\n", + "builder.add_edge(\"generate_queries\", \"get_video_ids\")\n", + "builder.add_edge(\"get_video_ids\", \"download\")\n", + "builder.add_edge(\"download\", \"detect_segments\")\n", + "builder.add_edge(\"detect_segments\", \"extract_clues\")\n", + "builder.add_edge(\"extract_clues\", \"gen_annotations\")\n", + "builder.add_edge(\"gen_annotations\", END)\n", + "\n", + "graph = builder.compile(checkpointer=memory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "thread = {\"configurable\": {\"thread_id\": \"1\"}}\n", + "for s in graph.stream(\n", + " {\n", + " \"task\": \"i wanna teach people how to do squats\",\n", + " \"clip_text_prompts\": [\"person doing squats\"],\n", + " },\n", + " thread,\n", + "):\n", + " if \"download\" in s:\n", + " print(\"download happened\")\n", + " 
elif \"extract_clues\" in s:\n", + " print(\"extract_clues happened\")\n", + " else:\n", + " print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph.get_state(thread).values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "datagen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/agent/tools/annotating.py b/agent/tools/annotating.py new file mode 100644 index 0000000..d38b547 --- /dev/null +++ b/agent/tools/annotating.py @@ -0,0 +1,181 @@ +from typing import List, Optional +from collections import defaultdict +from langchain.pydantic_v1 import BaseModel, Field +from langchain_core.prompts import ChatPromptTemplate + +# 4. Create nodes + +from .scraping import VideoInfo +from .video_chunking import SegmentInfo + + +class LocalClue(BaseModel): + """Local clues for a segment""" + + id: str = Field(description="LC1,LC2...") + quote: str = Field( + description="the quote from the transcript that was used to create this clue." + ) + quote_timestamp_start: str = Field( + description="the exact start timestamp of the quote." + ) + quote_timestamp_end: str = Field( + description="the exact end timestamp of the quote." + ) + clue: str = Field(description="the main clue data") + + +class GlobalClue(BaseModel): + """Global clues for a segment""" + + id: str = Field(description="GC1,GC2...") + quote: str = Field( + description="the quote from the transcript that was used to create this clue." + ) + quote_timestamp_start: str = Field( + description="the exact start timestamp of the quote." + ) + quote_timestamp_end: str = Field( + description="the exact end timestamp of the quote." + ) + clue: str = Field(description="the main clue data.") + relevance_to_segment: str = Field( + description="why do you think this global clue is relevant to the segment you are working with right now." + ) + + +class LogicalInference(BaseModel): + """Logical inferences for a segment""" + + id: str = Field(description="LI1,LI2,...") + description: str = Field(description="A concise form of the logical inference.") + details: str = Field( + description="A verbose explanation of what insight about what happens in this segment should be made based on the clues that you found." + ) + + +class SegmentAnnotation(BaseModel): + local_clues: list[LocalClue] = Field( + description="Local clues are inside the segment in terms of timestamps." + ) + global_clues: list[GlobalClue] = Field( + description="Global clues are scattered across the entire transcript." 
+    )
+    logical_inferences: list[LogicalInference] = Field(
+        description="What we can infer, based on the clues inside this segment, about the topic that the user is looking for in the video"
+    )
+
+
+class SegmentWithClueInfo(BaseModel):
+    """
+    Annotation for a video segment.
+    """
+
+    start_timestamp: str = Field(
+        description="start timestamp of the segment in format HH:MM:SS.MS"
+    )
+    end_timestamp: str = Field(
+        description="end timestamp of the segment in format HH:MM:SS.MS"
+    )
+    segment_annotation: SegmentAnnotation = Field(
+        description="list of annotations for the segment"
+    )
+
+
+class VideoAnnotation(BaseModel):
+    """
+    Segments of a video.
+    """
+
+    segments: list[SegmentWithClueInfo] = Field(
+        description="information about each segment"
+    )
+
+
+def extract_clues(
+    llm,
+    system_prompt: str,
+    segment_infos: List[SegmentInfo],
+    video_infos: List[VideoInfo],
+):
+
+    prompt_template = ChatPromptTemplate.from_messages(
+        [
+            ("system", system_prompt),
+            (
+                "user",
+                "Segment timecodes: {{ segment_timecodes }}\nTranscript: {{ transcript }}",
+            ),
+        ],
+        template_format="jinja2",
+    )
+
+    model = prompt_template | llm.with_structured_output(VideoAnnotation)
+
+    segment_infos_dict = defaultdict(list)
+    for segment_info in segment_infos:
+        segment_infos_dict[segment_info.video_id].append(segment_info)
+
+    video_infos_dict = {video_info.video_id: video_info for video_info in video_infos}
+
+    clues = []
+
+    for video_id, segment_infos in segment_infos_dict.items():
+        transcript = video_infos_dict[video_id].transcript
+        # annotate at most five segments per LLM call to keep the prompt small
+        segment_infos_chunks = [
+            segment_infos[i : i + 5] for i in range(0, len(segment_infos), 5)
+        ]
+
+        for chunk in segment_infos_chunks:
+            video_annotation: VideoAnnotation = model.invoke(
+                {
+                    "segment_timecodes": "\n".join(
+                        [f"{s.start_timestamp}-{s.end_timestamp}" for s in chunk]
+                    ),
+                    "transcript": transcript,
+                }
+            )
+            clues.extend(video_annotation.segments)
+
+    return clues
+
+
+def gen_annotations(llm, system_prompt: str, clues: List[SegmentAnnotation]):
+    class SegmentFeedback(BaseModel):
+        right: Optional[str] = Field(description="what was right in the performance")
+        wrong: Optional[str] = Field(description="what was wrong in the performance")
+        correction: Optional[str] = Field(
+            description="how and in what ways the performance could be improved"
+        )
+
+    # The segment timestamps are taken from the provided information.
+    class SegmentCompleteAnnotation(BaseModel):
+        squats_probability: Optional[str] = Field(
+            description="how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)"
+        )
+        squats_technique_correctness: Optional[str] = Field(
+            description="correctness of the squat technique."
+        )
+        squats_feedback: Optional[SegmentFeedback] = Field(
+            description="what was right and wrong in the squat performance in the segment. When the technique is incorrect, provide instructions on how to correct them."
+        )
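+
+    # Illustrative sketch only (hypothetical values, not taken from a real run):
+    # one structured annotation emitted for a segment might look roughly like
+    #   {"squats_probability": "high",
+    #    "squats_technique_correctness": "partially correct",
+    #    "squats_feedback": {"right": "good depth", "wrong": "heels lift off",
+    #                        "correction": "keep the weight on the heels"}}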
+
+    prompt_template = ChatPromptTemplate.from_messages(
+        [
+            ("system", system_prompt),
+            ("user", "Clues: {{ clues }}"),
+        ],
+        template_format="jinja2",
+    )
+
+    model = prompt_template | llm.with_structured_output(SegmentCompleteAnnotation)
+
+    annotations = []
+    for clue in clues:
+        segment_annotation: SegmentCompleteAnnotation = model.invoke(
+            {"clues": clue.json()}
+        )
+
+        annotations.append(segment_annotation.json())
+
+    return annotations
diff --git a/agent/tools/prompts.py b/agent/tools/prompts.py
new file mode 100644
index 0000000..bc99503
--- /dev/null
+++ b/agent/tools/prompts.py
@@ -0,0 +1,217 @@
+# 3. Set prompts
+
+GEN_QUERIES_PROMPT = (
+    "You are helping the user to find a very large and diverse set of videos on a video hosting service. "
+    "A user will only describe which videos they are looking for and how many queries they need."
+)
+
+# prompt='I want to find instructional videos about how to do squats.',
+# num_queries_prompt = f'I need {num_queries} queries'
+
+EXTRACT_CLUES_PROMPT = """You are a highly intelligent data investigator.
+You take unstructured, damaged data and look for clues that could help restore the initial information
+and extract important insights from it.
+You are the best one for this job in the world because you are a former detective.
+You care about even the smallest details, and your guesses about what happened in the initial file,
+even at very limited inputs, are usually absolutely right.
+You use deductive and inductive reasoning at the highest possible quality.
+
+#YOUR TODAY'S JOB
+The user needs to learn about what happens in a specific segment of a video file. Your job is to help the user by providing clues that would help the user make the right assumption.
+The user will provide you with:
+1. Instructions about what kind of information the user is trying to obtain.
+2. A list of time codes of the segments in format "-". All the provided segments of the video contain what the user is looking for, but other parts of the video might have different content.
+3. A transcript of the *full video* in format of "\\n"
+
+Your task:
+1. Read the transcript.
+2. Provide the clues in a given format.
+3. Provide any other info requested by the user.
+
+#RULES
+!!! VERY IMPORTANT !!!
+1. Rely only on the data provided in the transcript. Do not improvise. All the quotes and corresponding timestamps must be taken from the transcript. Quote timestamps must be taken directly from the transcript.
+2. Your job is to find the data already provided in the transcript.
+3. Analyze every segment. Only skip a segment if there is no information about it in the transcript.
+4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.
+5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.
+6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.
+7. Follow the output format.
+8. Be very careful with details. Don't generalize. Always double check your results.
+
+Please help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.
+
+WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data,
+is a fragment of information extracted from a corrupted or incomplete source that provides
+insight into the original content. 
These fragments serve as starting points for inference +and deduction, allowing researchers to hypothesize about the fuller context or meaning of +the degraded material. The process of identifying and interpreting clues involves both objective analysis of the +available data and subjective extrapolation based on domain knowledge, contextual understanding, +and logical reasoning. + +Here is what the user expects to have from you: +1. *Local clues* that would help the user undestand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside a specific segment. +2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific that they are provided for. +3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. Logical inferences for a segment are deducted from local and global clues for this segment. + +!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!! + + Good local clues examples: [ + { + "id": "LC1", + "timestamp": "00:00:19", + "quote": "exercises do them wrong and instead of", + "clue": "This phrase introduces the concept of incorrect exercise form, setting the stage for a demonstration of improper technique." + }, + { + "id": "LC2", + "timestamp": "00:00:21", + "quote": "growing nice quads and glutes you'll", + "clue": "Mentions the expected benefits of proper squats (muscle growth), implying that these benefits won't be achieved with incorrect form." + }, + { + "id": "LC3", + "timestamp": "00:00:22", + "quote": "feel aches and pains in your knees your", + "clue": "Directly states negative consequences of improper form, strongly suggesting that this segment demonstrates incorrect technique." + }, + { + "id": "LC4", + "timestamp": "00:00:24", + "quote": "lower back and even your shoulders", + "clue": "Continuation of LC3, emphasizing multiple areas of potential pain from improper form." + }, + { + "id": "LC5", + "timestamp": "00:00:26", + "quote": "let's see how to do it correctly", + "clue": "This phrase suggests a transition is about to occur. The incorrect form has been shown, and correct form will follow." + } + ] + + Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript. + For example, if the transcript says: + "00:05:02 + he took the glasses + 00:05:04 + and gave them to me" + Then a GOOD output will be: + - timestamp: 00:05:03 + - quote: "he took the glasses and gave them to me" + And a BAD output would be: + - timestamp: 00:04:02 + - quote: "he gave me the glasses" + + Good global clues examples: [ + { + "id": "GC1", + "timestamp": "00:01:15", + "quote": "Before we dive into specific techniques, let's talk about safety.", + "clue": "Introduces the theme of safety in squatting.", + "relevance_to_segment": "This earlier emphasis on safety provides context for why proper depth is important and why it's being addressed in our segment. It connects to the fear of knee pain mentioned in LC3." + }, + { + "id": "GC2", + "timestamp": "00:02:30", + "quote": "Squatting is a fundamental movement pattern in everyday life.", + "clue": "Emphasizes the importance of squats beyond just exercise.", + "relevance_to_segment": "This broader context heightens the importance of learning proper squat depth as demonstrated in our segment. 
It suggests that the techniques shown have applications beyond just gym workouts." + }, + { + "clue_id": "GC3", + "timestamp": "00:05:20", + "quote": "If you have existing knee issues, consult a physician before attempting deep squats.", + "clue": "Provides a health disclaimer related to squat depth.", + "relevance_to_segment": "While this comes after our segment, it's relevant because it addresses the concern about knee pain mentioned in LC3. It suggests that the demonstration in our segment is generally safe but acknowledges individual variations." + }, + { + "clue_id": "GC4", + "timestamp": "00:06:45", + "quote": "Proper depth ensures full engagement of your quadriceps and glutes.", + "clue": "Explains the benefit of correct squat depth.", + "relevance_to_segment": "This later explanation provides justification for the depth guideline given in LC4. It helps viewers understand why the demonstrated technique is important." + }, + { + "clue_id": "GC5", + "timestamp": "00:00:30", + "quote": "Today, we'll cover squat variations for beginners to advanced lifters.", + "clue": "Outlines the scope of the entire video.", + "relevance_to_segment": "This early statement suggests that our segment, focusing on proper depth, is part of a comprehensive guide. It implies that the demonstration might be adaptable for different skill levels." + } + ] + Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript. + For example, if the transcript says: + "00:05:02 + he took the glasses + 00:05:04 + and gave them to me" + Then a GOOD output will be: + - timestamp: 00:05:03 + - quote: "he took the glasses and gave them to me" + And a BAD output would be: + - timestamp: 00:04:02 + - quote: "he gave me the glasses" + + + Good logical inference examples: + [ + { + "id": "LI1", + "description": "Primary Demonstration of Heel Lift", + "details": "Given that GC1-GC3 describe the 'most common mistake' as heels lifting off the ground, and this description immediately precedes our segment, it's highly probable that this is the primary error being demonstrated. This is further supported by the segment's focus on incorrect form (LC1-LC4)." + }, + { + "id": "LI2", + "description": "Multiple Error Demonstration", + "details": "While heel lift is likely the primary focus, the mention of multiple pain points (knees, lower back, shoulders in LC3-LC4) suggests that the demonstrator may be exhibiting several forms of incorrect technique simultaneously. This comprehensive 'what not to do' approach would be pedagogically effective." + }, + { + "id": "LI3", + "description": "Possible Inclusion of 'Butt Wink'", + "details": "Although 'butt wink' is mentioned after our segment (GC4-GC6), its connection to back pain (which is mentioned in LC4) raises the possibility that this error is also present in the demonstration. The instructor may be showing multiple errors early on, then breaking them down individually later." + }, + { + "id": "LI4", + "description": "Segment Placement in Overall Video Structure", + "details": "The segment's position (starting at 00:00:19) and the phrase 'let's see how to do it correctly' (LC5) at the end suggest this is an early, foundational part of the video. It likely serves to grab attention by showing common mistakes before transitioning to proper form instruction." 
+    },
+    {
+        "id": "LI5",
+        "description": "Intentional Exaggeration of Errors",
+        "details": "Given the educational nature of the video, it's plausible that the demonstrator is intentionally exaggerating the incorrect form. This would make the errors more obvious to viewers and enhance the contrast with correct form shown later."
+    }
+    ]
+"""
+
+
+GEN_ANNOTATIONS_PROMPT = """You are a helpful assistant that performs high-quality data investigation and transformation.
+    You will be given a JSON object with clues and other helpful information about what's going on
+    in a specific part of a video file. This part is called a segment. Your job is to:
+    1. Read this JSON object carefully
+    2. Answer the user's questions about this segment
+    3. Provide the answer as a JSON object in a schema provided by the user
+    Important rules:
+    1. You can only rely on data presented in the provided JSON object. Don't improvise.
+    2. Follow the user's request carefully.
+    3. Don't rush to deliver the answer. Take some time to think. Take a deep breath. Then start writing.
+    4. If you want to output a field as empty (null), output it as JSON null (without quotes), not as a string "null".
+—> GOOD EXAMPLES:
+    "wrong":"Knees caving in: This can stress the knees and reduce effectiveness"
+    "correction":"Focus on keeping knees aligned with your toes."
+    "wrong":"Rounding the back: This increases the risk of back injuries"
+    "correction":"Keep your chest up and maintain a neutral spine throughout the movement."
+    "wrong":"Heels are lifting off the ground: this shifts the weight forward, reducing stability"
+    "correction":"Keep your weight on your heels and press through them as you rise."
+    "right":"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso."
+    "correction":null
+—> BAD EXAMPLES:
+    "wrong":"knees"
+    "correction":"fix knees"
+    "wrong":"back looks funny"
+    "correction":"make back better"
+    "wrong":"feet are doing something"
+    "correction":"feet should be different"
+    "right":"arms"
+    "correction":"arms are fine i think"
+—> BAD EXAMPLES END HERE
+"""
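+
+# For reference, a minimal sketch of how these prompt constants are consumed by
+# the tools in this series (illustrative only; `llm` is the AzureChatOpenAI
+# client configured in the notebook, and the task string is a made-up example):
+#
+#     from agent.tools.prompts import GEN_QUERIES_PROMPT
+#     from agent.tools.scraping import gen_queries
+#
+#     queries = gen_queries(llm, task="how to do squats", system_prompt=GEN_QUERIES_PROMPT)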
+ "correction":null +—> BAD EXAMPLES: + "wrong":"knees" + "correction":"fix knees" + "wrong":"back looks funny" + "correction":"make back better" + "wrong":"feet are doing something" + "correction":"feet should be different" + "right":"arms" + "correction":"arms are fine i think" +—> BAD EXAMPLES END HERE +""" diff --git a/agent/tools/scraping.py b/agent/tools/scraping.py new file mode 100644 index 0000000..521b267 --- /dev/null +++ b/agent/tools/scraping.py @@ -0,0 +1,150 @@ +from typing import List + +import scrapetube +import yt_dlp +from datetime import datetime +from pathlib import Path +from datagen.core.sub_utils import vtt_to_txt + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain.pydantic_v1 import BaseModel, Field + + +class VideoInfo(BaseModel): + video_id: str + url: str + relative_video_path: str + subs: str + transcript: str + + +def gen_queries(llm, task: str, system_prompt: str) -> List[str]: + class QueryList(BaseModel): + """A list of queries to find videos on a video hosting service""" + + search_queries: list[str] = Field(default=None, description="a list of queries") + + messages = [ + SystemMessage(content=str(system_prompt)), + HumanMessage(content=task), + ] + + model = llm.with_structured_output(QueryList) + response: QueryList = model.invoke(messages) + + return response.search_queries + + +def get_video_ids(queries: List[str]) -> List[str]: + videos_per_query = 1 + sleep = 0 + sort_by = "relevance" + results_type = "video" + only_creative_commons = False + + video_ids = set() + for query in queries: + for video in scrapetube.get_search( + query=query, + limit=videos_per_query, + sleep=sleep, + sort_by=sort_by, + results_type=results_type, + ): + video_ids.add(video["videoId"]) + video_ids = list(video_ids) + + if only_creative_commons: + video_ids_cc = [] + for i in video_ids: + YDL_OPTIONS = { + "quiet": True, + "simulate": True, + "forceurl": True, + } + with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl: + info = ydl.extract_info(f"youtube.com/watch?v={i}", download=False) + if "creative commons" in info.get("license", "").lower(): + video_ids_cc.append(i) + video_ids = video_ids_cc + + return video_ids + + +def download(video_ids: List[str]) -> List[VideoInfo]: + + LOCAL_ROOT = Path("./tmp/agent_squats").resolve() + video_dir = LOCAL_ROOT / "videos" + sub_dir = LOCAL_ROOT / "subs" + + discard_path = LOCAL_ROOT / "videos_without_subs" + discard_path.mkdir(parents=True, exist_ok=True) + + downloaded_video_ids = [video_path.stem for video_path in video_dir.glob("*.mp4")] + downloaded_video_ids += [ + video_path.stem for video_path in discard_path.glob("*.mp4") + ] + + print(f"Downloaded video ids: {downloaded_video_ids}") + + only_with_transcripts = True + + YDL_OPTIONS = { + "writeautomaticsub": True, + "subtitleslangs": ["en"], + "subtitlesformat": "vtt", + "overwrites": False, + "format": "mp4", + "outtmpl": { + "default": video_dir.as_posix() + "/%(id)s.%(ext)s", + "subtitle": sub_dir.as_posix() + "/%(id)s.%(ext)s", + }, + } + + video_infos = [] + + with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl: + for video_id in video_ids: + url = f"https://www.youtube.com/watch?v={video_id}" + + if video_id not in downloaded_video_ids: + try: + ydl.download(url) + except Exception as e: + print(datetime.now(), f"Error at video {video_id}, skipping") + print(datetime.now(), e) + continue + + video_path = Path(ydl.prepare_filename({"id": video_id, "ext": "mp4"})) + sub_path = Path( + ydl.prepare_filename( + {"id": video_id, "ext": "en.vtt"}, 
diff --git a/agent/tools/video_chunking.py b/agent/tools/video_chunking.py
new file mode 100644
index 0000000..7ffc59e
--- /dev/null
+++ b/agent/tools/video_chunking.py
@@ -0,0 +1,162 @@
+import decord
+import time
+from pathlib import Path
+from collections import defaultdict
+from datagen.detect_segments import get_segments
+import torch
+from transformers import AutoModel, AutoProcessor
+import pandas as pd
+from tsmoothie.smoother import LowessSmoother
+
+from typing import List
+
+from langchain.pydantic_v1 import BaseModel, Field
+
+# decord.bridge.set_bridge("torch")
+
+from .scraping import VideoInfo
+
+
+class SegmentInfo(BaseModel):
+    start_timestamp: str
+    end_timestamp: str
+    fps: float
+    video_id: str
+
+
+class VideoInferenceDataset(torch.utils.data.IterableDataset):
+    def __init__(self, video_infos: List[VideoInfo], local_root: Path):
+        super(VideoInferenceDataset).__init__()
+
+        self.video_infos = video_infos
+        self.local_root = local_root
+        self.frame_generator = self.get_frame_generator(video_infos, local_root)
+
+    @staticmethod
+    def get_frame_generator(video_infos, local_root: Path):
+
+        for video_idx, video_info in enumerate(video_infos):
+            video_path = local_root.joinpath(video_info.relative_video_path)
+            vr = decord.VideoReader(str(video_path))
+            num_frames = len(vr)
+            fps = vr.get_avg_fps()
+            # sample roughly one frame per second of video
+            frame_indices = range(0, num_frames, round(fps))
+
+            for frame_idx in frame_indices:
+                # print(f"Frame idx {frame_idx}")
+                frame = vr[frame_idx].asnumpy()
+                yield {
+                    "frame": frame,
+                    "frame_idx": frame_idx,
+                    "video_id": video_idx,
+                }
+
+    def __next__(self):
+        return next(self.frame_generator)
+
+    def __iter__(self):
+        return self
+
+
+def detect_segments(
+    video_infos: List[VideoInfo], clip_text_prompts: List[str]
+) -> List[SegmentInfo]:
+
+    LOCAL_ROOT = Path("./tmp/agent_squats").resolve()
+    CLIP_MODEL_ID = "google/siglip-so400m-patch14-384"
+
+    model = AutoModel.from_pretrained(CLIP_MODEL_ID).to("cuda")
+    processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)
+
+    dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)
+
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=1,
+        batch_size=12,
+        pin_memory=True,
+        # worker_init_fn=worker_init_fn,
+    )
+    dataloader = iter(dataloader)
+
+    smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)
+
+    clip_results_dict = defaultdict(list)
+
+    print("Init model complete")
+
+    batch_counter = 0
+    # cap the amount of work done in a single run
+    MAX_BATCHES = 50
+
+    while batch_counter < MAX_BATCHES:
+        batch_counter += 1
+        try:
+            start_time = time.time()
+            batch = next(dataloader)
+            # print(f"Fetch time: {time.time() - start_time:.2f} seconds")
+        except StopIteration:
+            break
+
+        inputs = processor(
+            images=batch["frame"],
+            text=clip_text_prompts,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+        )
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        outputs = model(**inputs)
+
+        logits = 
outputs.logits_per_image + probs = torch.nn.functional.sigmoid(logits).detach().cpu().numpy() + + for video_idx, frame_idx, prob in zip( + batch["video_id"], batch["frame_idx"], probs + ): + # print(type(video_id.item()), type(frame_idx.item()), type(prob.item())) + video_id = video_infos[video_idx.item()].video_id + + clip_results_dict["video_id"].append(video_id) + clip_results_dict["frame_idx"].append(frame_idx.item()) + clip_results_dict["probs"].append(prob.item()) + + print("All frames processed") + clip_results = pd.DataFrame(clip_results_dict) + print("Dataframe created") + print(clip_results) + + max_gap_seconds = 1 + fps_sampling = 1 + min_prob = 0.1 + min_segment_seconds = 3 + fps = 25 + + segment_infos = [] + for video_id, video_clip_results in clip_results.groupby("video_id"): + probs = video_clip_results["probs"].values + probs = smoother.smooth(probs).smooth_data[0] + segments_start_end = get_segments( + probs, + max_gap=round(max_gap_seconds * fps_sampling), + min_prob=min_prob, + min_segment=round(min_segment_seconds * fps_sampling), + ) + + print(f"Segments for video {video_id}: {segments_start_end}") + + sec2ts = lambda s: time.strftime( + f"%H:%M:%S.{round((s%1)*1000):03d}", time.gmtime(s) + ) + + for start, end in segments_start_end: + segment_infos.append( + SegmentInfo( + start_timestamp=sec2ts(start), + end_timestamp=sec2ts(end), + fps=fps, + video_id=video_id, + ) + ) + + return segment_infos From be29999cc9cc2d189c116aa4d7df2d88fa360fcb Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Wed, 28 Aug 2024 07:16:37 +0000 Subject: [PATCH 7/9] Fix import errors in refactored agent --- agent/{agent.py => data_agent.py} | 0 agent/run_agent.ipynb | 98 +++++++++++++++++++++++++++++++ agent/tools/scraping.py | 2 +- agent/tools/sub_utils.py | 90 ++++++++++++++++++++++++++++ agent/tools/video_chunking.py | 34 ++++++++++- 5 files changed, 220 insertions(+), 4 deletions(-) rename agent/{agent.py => data_agent.py} (100%) create mode 100644 agent/run_agent.ipynb create mode 100644 agent/tools/sub_utils.py diff --git a/agent/agent.py b/agent/data_agent.py similarity index 100% rename from agent/agent.py rename to agent/data_agent.py diff --git a/agent/run_agent.ipynb b/agent/run_agent.ipynb new file mode 100644 index 0000000..5377f36 --- /dev/null +++ b/agent/run_agent.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "_ = load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "torch.device(\"cuda:0\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from data_agent import DataAgent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import AzureChatOpenAI\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "\n", + "llm = AzureChatOpenAI(\n", + " temperature=0.0,\n", + " azure_deployment=\"gpt4o\",\n", + " openai_api_version=\"2023-07-01-preview\",\n", + ")\n", + "\n", + "memory = MemorySaver()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = DataAgent(llm, memory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent.run(\"1\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "vlm_databuilder_agent", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/agent/tools/scraping.py b/agent/tools/scraping.py index 521b267..79d3c61 100644 --- a/agent/tools/scraping.py +++ b/agent/tools/scraping.py @@ -4,7 +4,7 @@ import yt_dlp from datetime import datetime from pathlib import Path -from datagen.core.sub_utils import vtt_to_txt +from .sub_utils import vtt_to_txt from langchain_core.messages import HumanMessage, SystemMessage from langchain.pydantic_v1 import BaseModel, Field diff --git a/agent/tools/sub_utils.py b/agent/tools/sub_utils.py new file mode 100644 index 0000000..4a59802 --- /dev/null +++ b/agent/tools/sub_utils.py @@ -0,0 +1,90 @@ + +# https://gist.github.com/glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e + + +# import sys +import re + +def remove_tags(text): + """ + Remove vtt markup tags + """ + tags = [ + r'', + r'', + r'<\d{2}:\d{2}:\d{2}\.\d{3}>', + + ] + + for pat in tags: + text = re.sub(pat, '', text) + + # extract timestamp, only kep HH:MM + text = re.sub( + r'(\d{2}:\d{2}:\d{2})\.\d{3} --> .* align:start position:0%', + r'\g<1>', + text + ) + + text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE) + return text + +def remove_header(lines): + """ + Remove vtt file header + """ + pos = -1 + for mark in ('##', 'Language: en',): + if mark in lines: + pos = lines.index(mark) + lines = lines[pos+1:] + return lines + + +def merge_duplicates(lines): + """ + Remove duplicated subtitles. Duplacates are always adjacent. 
+ """ + last_timestamp = '' + last_cap = '' + for line in lines: + if line == "": + continue + if re.match('^\d{2}:\d{2}:\d{2}$', line): + if line != last_timestamp: + yield line + last_timestamp = line + else: + if line != last_cap: + yield line + last_cap = line + + +def merge_short_lines(lines): + buffer = '' + for line in lines: + if line == "" or re.match('^\d{2}:\d{2}$', line): + yield '\n' + line + continue + + if len(line+buffer) < 80: + buffer += ' ' + line + else: + yield buffer.strip() + buffer = line + yield buffer + +def vtt_to_txt(vtt_file_name, as_list=True): + # txt_name = re.sub(r'.vtt$', '.txt', vtt_file_name) + with open(vtt_file_name) as f: + text = f.read() + text = remove_tags(text) + lines = text.splitlines() + lines = remove_header(lines) + lines = merge_duplicates(lines) + lines = list(lines) + # lines = merge_short_lines(lines) + # lines = list(lines) + lines = '\n'.join(lines) + + return lines \ No newline at end of file diff --git a/agent/tools/video_chunking.py b/agent/tools/video_chunking.py index 7ffc59e..3d41dac 100644 --- a/agent/tools/video_chunking.py +++ b/agent/tools/video_chunking.py @@ -2,7 +2,6 @@ import time from pathlib import Path from collections import defaultdict -from datagen.detect_segments import get_segments import torch from transformers import AutoModel, AutoProcessor import pandas as pd @@ -17,6 +16,9 @@ from .scraping import VideoInfo +DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + + class SegmentInfo(BaseModel): start_timestamp: str end_timestamp: str @@ -58,6 +60,32 @@ def __iter__(self): return self +def get_segments(data, max_gap=3, min_prob=0.1, min_segment=5): + segments = [] + cur_segment_start = None + not_doing = 0 + for i, p in enumerate(data): + if p >= min_prob and cur_segment_start is None: + cur_segment_start = i + elif cur_segment_start is not None and p < min_prob: + if not_doing >= max_gap: + if i - not_doing - cur_segment_start >= min_segment: + segments.append((cur_segment_start, i - not_doing)) + not_doing = 0 + cur_segment_start = None + else: + not_doing += 1 + elif p >= min_prob: + not_doing = 0 + if ( + cur_segment_start is not None + and (i - not_doing - cur_segment_start) >= min_segment + ): + segments.append((cur_segment_start, i - not_doing)) + + return segments + + def detect_segments( video_infos: List[VideoInfo], clip_text_prompts: List[str] ) -> List[SegmentInfo]: @@ -65,7 +93,7 @@ def detect_segments( LOCAL_ROOT = Path("./tmp/agent_squats").resolve() CLIP_MODEL_ID = "google/siglip-so400m-patch14-384" - model = AutoModel.from_pretrained(CLIP_MODEL_ID).to("cuda") + model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(DEVICE) processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID) dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT) @@ -104,7 +132,7 @@ def detect_segments( padding=True, truncation=True, ) - inputs = {k: v.to("cuda") for k, v in inputs.items()} + inputs = {k: v.to(DEVICE) for k, v in inputs.items()} outputs = model(**inputs) From 20c5e590f33bce2042b346d58cbbcfda39840487 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Wed, 28 Aug 2024 07:25:54 +0000 Subject: [PATCH 8/9] Remove old agent notebook --- agent/agent_parts.ipynb | 946 ---------------------------------------- 1 file changed, 946 deletions(-) delete mode 100644 agent/agent_parts.ipynb diff --git a/agent/agent_parts.ipynb b/agent/agent_parts.ipynb deleted file mode 100644 index a482a21..0000000 --- a/agent/agent_parts.ipynb +++ /dev/null @@ -1,946 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "_ = load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import TypedDict, Annotated, Sequence, List, Optional\n", - "import operator\n", - "\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "from langchain.pydantic_v1 import BaseModel, Field\n", - "from langchain_core.prompts import ChatPromptTemplate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import AzureChatOpenAI\n", - "\n", - "llm = AzureChatOpenAI(\n", - " temperature=0.0,\n", - " azure_deployment=\"gpt4o\",\n", - " openai_api_version=\"2023-07-01-preview\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class VideoInfo(BaseModel):\n", - " video_id: str\n", - " url: str\n", - " relative_video_path: str\n", - " subs: str\n", - " transcript: str\n", - "\n", - "\n", - "class SegmentInfo(BaseModel):\n", - " start_timestamp: str\n", - " end_timestamp: str\n", - " fps: float\n", - " video_id: str\n", - "\n", - "\n", - "class LocalClue(BaseModel):\n", - " \"\"\"Local clues for a segment\"\"\"\n", - "\n", - " id: str = Field(description=\"LC1,LC2...\")\n", - " quote: str = Field(\n", - " description=\"the quote from the transcript that was used to create this clue.\"\n", - " )\n", - " quote_timestamp_start: str = Field(\n", - " description=\"the exact start timestamp of the quote.\"\n", - " )\n", - " quote_timestamp_end: str = Field(\n", - " description=\"the exact end timestamp of the quote.\"\n", - " )\n", - " clue: str = Field(description=\"the main clue data\")\n", - "\n", - "\n", - "class GlobalClue(BaseModel):\n", - " \"\"\"Global clues for a segment\"\"\"\n", - "\n", - " id: str = Field(description=\"GC1,GC2...\")\n", - " quote: str = Field(\n", - " description=\"the quote from the transcript that was used to create this clue.\"\n", - " )\n", - " quote_timestamp_start: str = Field(\n", - " description=\"the exact start timestamp of the quote.\"\n", - " )\n", - " quote_timestamp_end: str = Field(\n", - " description=\"the exact end timestamp of the quote.\"\n", - " )\n", - " clue: str = Field(description=\"the main clue data.\")\n", - " relevance_to_segment: str = Field(\n", - " description=\"why do you think this global clue is relevant to the segment you are working with right now.\"\n", - " )\n", - "\n", - "\n", - "class LogicalInference(BaseModel):\n", - " \"\"\"Logical inferences for a segment\"\"\"\n", - "\n", - " id: str = Field(description=\"LI1,LI2,...\")\n", - " description: str = Field(description=\"A concise form of the logical inference.\")\n", - " details: str = Field(\n", - " description=\"A verbose explanation of what insight about what happens in this segment should be made based on the clues that you found.\"\n", - " )\n", - "\n", - "\n", - "class SegmentAnnotation(BaseModel):\n", - " local_clues: list[LocalClue] = Field(\n", - " description=\"Local clues are inside the segment in terms of timestamps.\"\n", - " )\n", - " global_clues: list[GlobalClue] = Field(\n", - " description=\"Global clues are scattered across the entire transcript.\"\n", - " )\n", - " logical_inferences: list[LogicalInference] = Field(\n", - " description=\"What can we infer about the topic, that the user is looking for in 
the video, can we make based on the clues inside this segment\"\n", - " )\n", - "\n", - "\n", - "class SegmentWithClueInfo(BaseModel):\n", - " \"\"\"\n", - " Annotation for a video segment.\n", - " \"\"\"\n", - "\n", - " start_timestamp: str = Field(\n", - " description=\"start timestamp of the segment in format HH:MM:SS.MS\"\n", - " )\n", - " end_timestamp: str = Field(\n", - " description=\"start timestamp of the segment in format HH:MM:SS.MS\"\n", - " )\n", - " segment_annotation: SegmentAnnotation = Field(\n", - " description=\"list of annotations for the segment\"\n", - " )\n", - "\n", - "\n", - "class VideoAnnotation(BaseModel):\n", - " \"\"\"\n", - " Segments of a video.\n", - " \"\"\"\n", - "\n", - " segments: list[SegmentWithClueInfo] = Field(\n", - " description=\"information about each segment\"\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 2. Create the state\n", - "\n", - "\n", - "class AgentState(TypedDict):\n", - " task: str\n", - " search_queries: List[str]\n", - " video_ids: List[str]\n", - " video_infos: List[VideoInfo]\n", - " clip_text_prompts: List[str]\n", - " segment_infos: List[SegmentInfo]\n", - " clues: List[str]\n", - " annotations: List[str]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 3. Set prompts\n", - "\n", - "GEN_QUERIES_PROMPT = (\n", - " \"You a helping the user to find a very large and diverse set of videos on a video hosting service.\",\n", - " \"A user will only describe which videos they are looking for and how many queries they need.\",\n", - ")\n", - "\n", - "# prompt='I want to find instructional videos about how to do squats.',\n", - "# num_queries_prompt = f'I need {num_queries} queries'\n", - "\n", - "EXTRACT_CLUES_PROMPT = \"\"\"You are a highly intelligent data investigator. \n", - "You take unstructured damaged data and look for clues that could help restore the initial information\n", - "and extract important insights from it.\n", - "You are the best one for this job in the world because you are a former detective. \n", - "You care about even the smallest details, and your guesses about what happened in the initial file\n", - "even at very limited inputs are usually absolutely right. \n", - "You use deductive and inductive reasoning at the highest possible quality.\n", - "\n", - "#YOUR TODAY'S JOB\n", - "The user needs to learn about what happens in a specific segment of a video file. Your job is to help the user by providing clues that would help the user make the right assumption.\n", - "The user will provide you with: \n", - "1. Instructions about what kind of information the user is trying to obtain.\n", - "2. A list of time codes of the segments in format \"-\". All the provided segment of the video contain what the user is looking for, but other parts of the video might have different content.\n", - "3. A transcript of the *full video* in format of \"\\\\n\"\n", - "\n", - "Your task:\n", - "1. Read the transcript.\n", - "2. Provide the clues in a given format.\n", - "3. Provied any other info requested by the user.\n", - "\n", - "#RULES\n", - "!!! VERY IMPORTANT !!!\n", - "1. Rely only on the data provided in the transcript. Do not improvise. All the quotes and corresponding timestamps must be taken from the transcript. Quote timestamps must be taken directly from the transcript.\n", - "2. Your job is to find the data already provided in the transcript.\n", - "3. 
Analyze every segment. Only skip a segment if there is no information about it in the trascript.\n", - "4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.\n", - "5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.\n", - "6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.\n", - "7. Follow the format output.\n", - "8. Be very careful with details. Don't generalize. Always double check your results.\n", - "\n", - "Please, help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.\n", - "\n", - "WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data, \n", - "is a fragment of information extracted from a corrupted or incomplete source that provides \n", - "insight into the original content. These fragments serve as starting points for inference \n", - "and deduction, allowing researchers to hypothesize about the fuller context or meaning of \n", - "the degraded material. The process of identifying and interpreting clues involves both objective analysis of the \n", - "available data and subjective extrapolation based on domain knowledge, contextual understanding, \n", - "and logical reasoning.\n", - "\n", - "Here is what the user expects to have from you:\n", - "1. *Local clues* that would help the user undestand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside a specific segment.\n", - "2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific that they are provided for.\n", - "3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. 
Logical inferences for a segment are deducted from local and global clues for this segment.\n", - "\n", - "!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!\n", - "\n", - " Good local clues examples: [\n", - " {\n", - " \"id\": \"LC1\",\n", - " \"timestamp\": \"00:00:19\",\n", - " \"quote\": \"exercises do them wrong and instead of\",\n", - " \"clue\": \"This phrase introduces the concept of incorrect exercise form, setting the stage for a demonstration of improper technique.\"\n", - " },\n", - " {\n", - " \"id\": \"LC2\",\n", - " \"timestamp\": \"00:00:21\",\n", - " \"quote\": \"growing nice quads and glutes you'll\",\n", - " \"clue\": \"Mentions the expected benefits of proper squats (muscle growth), implying that these benefits won't be achieved with incorrect form.\"\n", - " },\n", - " {\n", - " \"id\": \"LC3\",\n", - " \"timestamp\": \"00:00:22\",\n", - " \"quote\": \"feel aches and pains in your knees your\",\n", - " \"clue\": \"Directly states negative consequences of improper form, strongly suggesting that this segment demonstrates incorrect technique.\"\n", - " },\n", - " {\n", - " \"id\": \"LC4\",\n", - " \"timestamp\": \"00:00:24\",\n", - " \"quote\": \"lower back and even your shoulders\",\n", - " \"clue\": \"Continuation of LC3, emphasizing multiple areas of potential pain from improper form.\"\n", - " },\n", - " {\n", - " \"id\": \"LC5\",\n", - " \"timestamp\": \"00:00:26\",\n", - " \"quote\": \"let's see how to do it correctly\",\n", - " \"clue\": \"This phrase suggests a transition is about to occur. The incorrect form has been shown, and correct form will follow.\"\n", - " }\n", - " ]\n", - "\n", - " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", - " For example, if the transcript says:\n", - " \"00:05:02\n", - " he took the glasses\n", - " 00:05:04\n", - " and gave them to me\"\n", - " Then a GOOD output will be:\n", - " - timestamp: 00:05:03\n", - " - quote: \"he took the glasses and gave them to me\"\n", - " And a BAD output would be:\n", - " - timestamp: 00:04:02\n", - " - quote: \"he gave me the glasses\"\n", - "\n", - " Good global clues examples: [\n", - " {\n", - " \"id\": \"GC1\",\n", - " \"timestamp\": \"00:01:15\",\n", - " \"quote\": \"Before we dive into specific techniques, let's talk about safety.\",\n", - " \"clue\": \"Introduces the theme of safety in squatting.\",\n", - " \"relevance_to_segment\": \"This earlier emphasis on safety provides context for why proper depth is important and why it's being addressed in our segment. It connects to the fear of knee pain mentioned in LC3.\"\n", - " },\n", - " {\n", - " \"id\": \"GC2\",\n", - " \"timestamp\": \"00:02:30\",\n", - " \"quote\": \"Squatting is a fundamental movement pattern in everyday life.\",\n", - " \"clue\": \"Emphasizes the importance of squats beyond just exercise.\",\n", - " \"relevance_to_segment\": \"This broader context heightens the importance of learning proper squat depth as demonstrated in our segment. It suggests that the techniques shown have applications beyond just gym workouts.\"\n", - " },\n", - " {\n", - " \"clue_id\": \"GC3\",\n", - " \"timestamp\": \"00:05:20\",\n", - " \"quote\": \"If you have existing knee issues, consult a physician before attempting deep squats.\",\n", - " \"clue\": \"Provides a health disclaimer related to squat depth.\",\n", - " \"relevance_to_segment\": \"While this comes after our segment, it's relevant because it addresses the concern about knee pain mentioned in LC3. 
It suggests that the demonstration in our segment is generally safe but acknowledges individual variations.\"\n", - " },\n", - " {\n", - " \"clue_id\": \"GC4\",\n", - " \"timestamp\": \"00:06:45\",\n", - " \"quote\": \"Proper depth ensures full engagement of your quadriceps and glutes.\",\n", - " \"clue\": \"Explains the benefit of correct squat depth.\",\n", - " \"relevance_to_segment\": \"This later explanation provides justification for the depth guideline given in LC4. It helps viewers understand why the demonstrated technique is important.\"\n", - " },\n", - " {\n", - " \"clue_id\": \"GC5\",\n", - " \"timestamp\": \"00:00:30\",\n", - " \"quote\": \"Today, we'll cover squat variations for beginners to advanced lifters.\",\n", - " \"clue\": \"Outlines the scope of the entire video.\",\n", - " \"relevance_to_segment\": \"This early statement suggests that our segment, focusing on proper depth, is part of a comprehensive guide. It implies that the demonstration might be adaptable for different skill levels.\"\n", - " }\n", - " ]\n", - " Double check that the timestamp and the quote that you provide exactly correspond to what you found in the transcript.\n", - " For example, if the transcript says:\n", - " \"00:05:02\n", - " he took the glasses\n", - " 00:05:04\n", - " and gave them to me\"\n", - " Then a GOOD output will be:\n", - " - timestamp: 00:05:03\n", - " - quote: \"he took the glasses and gave them to me\"\n", - " And a BAD output would be:\n", - " - timestamp: 00:04:02\n", - " - quote: \"he gave me the glasses\"\n", - " \n", - "\n", - " Good logical inference examples:\n", - " [\n", - " {\n", - " \"id\": \"LI1\",\n", - " \"description\": \"Primary Demonstration of Heel Lift\",\n", - " \"details\": \"Given that GC1-GC3 describe the 'most common mistake' as heels lifting off the ground, and this description immediately precedes our segment, it's highly probable that this is the primary error being demonstrated. This is further supported by the segment's focus on incorrect form (LC1-LC4).\"\n", - " },\n", - " {\n", - " \"id\": \"LI2\",\n", - " \"description\": \"Multiple Error Demonstration\",\n", - " \"details\": \"While heel lift is likely the primary focus, the mention of multiple pain points (knees, lower back, shoulders in LC3-LC4) suggests that the demonstrator may be exhibiting several forms of incorrect technique simultaneously. This comprehensive 'what not to do' approach would be pedagogically effective.\"\n", - " },\n", - " {\n", - " \"id\": \"LI3\",\n", - " \"description\": \"Possible Inclusion of 'Butt Wink'\",\n", - " \"details\": \"Although 'butt wink' is mentioned after our segment (GC4-GC6), its connection to back pain (which is mentioned in LC4) raises the possibility that this error is also present in the demonstration. The instructor may be showing multiple errors early on, then breaking them down individually later.\"\n", - " },\n", - " {\n", - " \"id\": \"LI4\",\n", - " \"description\": \"Segment Placement in Overall Video Structure\",\n", - " \"details\": \"The segment's position (starting at 00:00:19) and the phrase 'let's see how to do it correctly' (LC5) at the end suggest this is an early, foundational part of the video. 
It likely serves to grab attention by showing common mistakes before transitioning to proper form instruction.\"\n", - " },\n", - " {\n", - " \"id\": \"LI5\",\n", - " \"description\": \"Intentional Exaggeration of Errors\",\n", - " \"details\": \"Given the educational nature of the video, it's plausible that the demonstrator is intentionally exaggerating the incorrect form. This would make the errors more obvious to viewers and enhance the contrast with correct form shown later.\"\n", - " }\n", - " ]\n", - "\"\"\"\n", - "\n", - "\n", - "GEN_ANNOTATIONS_PROMPT = \"\"\"You are a helpful assistant that performs high quality data investigation and transformation.\n", - " You will be given a JSON object with clues and other helpful information about what's going on \n", - " in a specific part of a video file. This part is called a segment. Your job is to:\n", - " 1. Read this JSON object carefully\n", - " 2. Answer user's questions about this segment\n", - " 3. Provide the answer as a JSON object in a schema provided by the user\n", - " Important rules:\n", - " 1. You can only rely on data presented in a provided JSON object. Don't improvise.\n", - " 2. Follow user's request carefully.\n", - " 3. Don't rush to deliver the answer. Take some time to think. Make a deep breath. Then start writing.\n", - " 4. If you want to output field as empty (null), output it as JSON null (without quotes), not as a string \"null\". \n", - "—> GOOD EXAMPLES:\n", - " \"wrong\":\"Knees caving in: This can stress the knees and reduce effectiveness\"\n", - " \"correction\":\"Focus on keeping knees aligned with your toes.\"\n", - " \"wrong\":\"Rounding the back: This increases the risk of back injuries\"\n", - " \"correction\":\"Keep your chest up and maintain a neutral spine throughout the movement.\"\n", - " \"wrong\":\"Heels are lifting off the ground: this shifts the weight forward, reducing stability\"\n", - " \"correction\":\" Keep your weight on your heels and press through them as you rise.\"\n", - " \"right\":\"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso.\"\n", - " \"correction\":null\n", - "—> BAD EXAMPLES:\n", - " \"wrong\":\"knees\"\n", - " \"correction\":\"fix knees\"\n", - " \"wrong\":\"back looks funny\"\n", - " \"correction\":\"make back better\"\n", - " \"wrong\":\"feet are doing something\"\n", - " \"correction\":\"feet should be different\"\n", - " \"right\":\"arms\"\n", - " \"correction\":\"arms are fine i think\"\n", - "—> BAD EXAMPLES END HERE\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import scrapetube\n", - "import yt_dlp\n", - "from datetime import datetime\n", - "from pathlib import Path\n", - "from collections import defaultdict\n", - "from datagen.core.sub_utils import vtt_to_txt\n", - "from datagen.detect_segments import get_segments\n", - "import torch\n", - "from transformers import AutoModel, AutoProcessor\n", - "import pandas as pd\n", - "from tsmoothie.smoother import LowessSmoother" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import decord\n", - "import math\n", - "import numpy as np\n", - "\n", - "# decord.bridge.set_bridge(\"torch\")\n", - "\n", - "\n", - "class VideoInferenceDataset(torch.utils.data.IterableDataset):\n", - " def __init__(self, video_infos: List[VideoInfo], local_root: Path):\n", - " super(VideoInferenceDataset).__init__()\n", - "\n", - " self.video_infos = 
video_infos\n", - " self.local_root = local_root\n", - " self.frame_generator = self.get_frame_generator(video_infos, local_root)\n", - "\n", - " @staticmethod\n", - " def get_frame_generator(video_infos, local_root: Path):\n", - "\n", - " for video_idx, video_info in enumerate(video_infos):\n", - " video_path = local_root.joinpath(video_info.relative_video_path)\n", - " vr = decord.VideoReader(str(video_path))\n", - " num_frames = len(vr)\n", - " fps = vr.get_avg_fps()\n", - " frame_indices = range(0, num_frames, round(fps))\n", - "\n", - " for frame_idx in frame_indices:\n", - " # print(f\"Frame idx {frame_idx}\")\n", - " frame = vr[frame_idx].asnumpy()\n", - " yield {\n", - " \"frame\": frame,\n", - " \"frame_idx\": frame_idx,\n", - " \"video_id\": video_idx,\n", - " }\n", - "\n", - " def __next__(self):\n", - " return next(self.frame_generator)\n", - "\n", - " def __iter__(self):\n", - " return self" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import math\n", - "\n", - "# 4. Create nodes\n", - "\n", - "\n", - "def gen_queries_node(state: AgentState):\n", - " class QueryList(BaseModel):\n", - " \"\"\"A list of queries to find videos on a video hosting service\"\"\"\n", - "\n", - " search_queries: list[str] = Field(default=None, description=\"a list of queries\")\n", - "\n", - " messages = [\n", - " SystemMessage(content=str(GEN_QUERIES_PROMPT)),\n", - " HumanMessage(content=state[\"task\"]),\n", - " ]\n", - "\n", - " model = llm.with_structured_output(QueryList)\n", - " response: QueryList = model.invoke(messages)\n", - "\n", - " return {\"search_queries\": response.search_queries[:2]}\n", - "\n", - "\n", - "def get_video_ids_node(state: AgentState):\n", - "\n", - " queries = state[\"search_queries\"]\n", - " videos_per_query = 1\n", - " sleep = 0\n", - " sort_by = \"relevance\"\n", - " results_type = \"video\"\n", - " only_creative_commons = False\n", - "\n", - " video_ids = set()\n", - " for query in queries:\n", - " for video in scrapetube.get_search(\n", - " query=query,\n", - " limit=videos_per_query,\n", - " sleep=sleep,\n", - " sort_by=sort_by,\n", - " results_type=results_type,\n", - " ):\n", - " video_ids.add(video[\"videoId\"])\n", - " video_ids = list(video_ids)\n", - "\n", - " if only_creative_commons:\n", - " video_ids_cc = []\n", - " for i in video_ids:\n", - " YDL_OPTIONS = {\n", - " \"quiet\": True,\n", - " \"simulate\": True,\n", - " \"forceurl\": True,\n", - " }\n", - " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", - " info = ydl.extract_info(f\"youtube.com/watch?v={i}\", download=False)\n", - " if \"creative commons\" in info.get(\"license\", \"\").lower():\n", - " video_ids_cc.append(i)\n", - " video_ids = video_ids_cc\n", - "\n", - " return {\"video_ids\": video_ids}\n", - "\n", - "\n", - "def download_node(state: AgentState):\n", - "\n", - " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", - " video_dir = LOCAL_ROOT / \"videos\"\n", - " sub_dir = LOCAL_ROOT / \"subs\"\n", - "\n", - " discard_path = LOCAL_ROOT / \"videos_without_subs\"\n", - " discard_path.mkdir(parents=True, exist_ok=True)\n", - "\n", - " video_ids = state[\"video_ids\"]\n", - "\n", - " downloaded_video_ids = [video_path.stem for video_path in video_dir.glob(\"*.mp4\")]\n", - " downloaded_video_ids += [\n", - " video_path.stem for video_path in discard_path.glob(\"*.mp4\")\n", - " ]\n", - "\n", - " print(f\"Downloaded video ids: {downloaded_video_ids}\")\n", - "\n", - " only_with_transcripts = True\n", 
- "\n", - " YDL_OPTIONS = {\n", - " \"writeautomaticsub\": True,\n", - " \"subtitleslangs\": [\"en\"],\n", - " \"subtitlesformat\": \"vtt\",\n", - " \"overwrites\": False,\n", - " \"format\": \"mp4\",\n", - " \"outtmpl\": {\n", - " \"default\": video_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", - " \"subtitle\": sub_dir.as_posix() + \"/%(id)s.%(ext)s\",\n", - " },\n", - " }\n", - "\n", - " video_infos = []\n", - "\n", - " with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:\n", - " for video_id in video_ids:\n", - " url = f\"https://www.youtube.com/watch?v={video_id}\"\n", - "\n", - " if video_id not in downloaded_video_ids:\n", - " try:\n", - " ydl.download(url)\n", - " except Exception as e:\n", - " print(datetime.now(), f\"Error at video {video_id}, skipping\")\n", - " print(datetime.now(), e)\n", - " continue\n", - "\n", - " video_path = Path(ydl.prepare_filename({\"id\": video_id, \"ext\": \"mp4\"}))\n", - " sub_path = Path(\n", - " ydl.prepare_filename(\n", - " {\"id\": video_id, \"ext\": \"en.vtt\"}, dir_type=\"subtitle\"\n", - " )\n", - " )\n", - "\n", - " with sub_path.open(\"r\") as f:\n", - " subs = f.read()\n", - "\n", - " transcript = vtt_to_txt(sub_path)\n", - "\n", - " video_info = VideoInfo(\n", - " video_id=video_id,\n", - " url=url,\n", - " relative_video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),\n", - " subs=subs,\n", - " transcript=transcript,\n", - " )\n", - "\n", - " video_infos.append(video_info)\n", - "\n", - " if only_with_transcripts:\n", - " filtered_video_infos = []\n", - " for video_info in video_infos:\n", - " if video_info.transcript:\n", - " filtered_video_infos.append(video_info)\n", - " else:\n", - " video_path = LOCAL_ROOT / video_info.video_path\n", - " video_path.rename(discard_path / video_path.name)\n", - " video_infos = filtered_video_infos\n", - "\n", - " return {\"video_infos\": video_infos}\n", - "\n", - "\n", - "def detect_segments_node(state: AgentState):\n", - "\n", - " LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n", - "\n", - " clip_text_prompts = state[\"clip_text_prompts\"]\n", - " video_infos = state[\"video_infos\"]\n", - "\n", - " CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n", - "\n", - " model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(\"cuda\")\n", - " processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n", - "\n", - " dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n", - "\n", - " dataloader = torch.utils.data.DataLoader(\n", - " dataset,\n", - " num_workers=1,\n", - " batch_size=12,\n", - " pin_memory=True,\n", - " # worker_init_fn=worker_init_fn,\n", - " )\n", - " dataloader = iter(dataloader)\n", - "\n", - " smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n", - "\n", - " clip_results_dict = defaultdict(list)\n", - "\n", - " print(\"Init model complete\")\n", - "\n", - " batch_counter = 0\n", - " MAX_BATCHES = 50\n", - "\n", - " while batch_counter < MAX_BATCHES:\n", - " batch_counter += 1\n", - " try:\n", - " start_time = time.time()\n", - " batch = next(dataloader)\n", - " # print(f\"Fetch time: {time.time() - start_time:.2f} seconds\")\n", - " except StopIteration:\n", - " break\n", - "\n", - " inputs = processor(\n", - " images=batch[\"frame\"],\n", - " text=clip_text_prompts,\n", - " return_tensors=\"pt\",\n", - " padding=True,\n", - " truncation=True,\n", - " )\n", - " inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n", - "\n", - " outputs = model(**inputs)\n", - "\n", - " logits = outputs.logits_per_image\n", - " probs = 
-    "def detect_segments_node(state: AgentState):\n",
-    "\n",
-    "    LOCAL_ROOT = Path(\"./tmp/agent_squats\").resolve()\n",
-    "\n",
-    "    clip_text_prompts = state[\"clip_text_prompts\"]\n",
-    "    video_infos = state[\"video_infos\"]\n",
-    "\n",
-    "    CLIP_MODEL_ID = \"google/siglip-so400m-patch14-384\"\n",
-    "\n",
-    "    model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(\"cuda\")\n",
-    "    processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)\n",
-    "\n",
-    "    dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)\n",
-    "\n",
-    "    dataloader = torch.utils.data.DataLoader(\n",
-    "        dataset,\n",
-    "        num_workers=1,\n",
-    "        batch_size=12,\n",
-    "        pin_memory=True,\n",
-    "        # worker_init_fn=worker_init_fn,\n",
-    "    )\n",
-    "    dataloader = iter(dataloader)\n",
-    "\n",
-    "    smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)\n",
-    "\n",
-    "    clip_results_dict = defaultdict(list)\n",
-    "\n",
-    "    print(\"Init model complete\")\n",
-    "\n",
-    "    batch_counter = 0\n",
-    "    MAX_BATCHES = 50\n",
-    "\n",
-    "    while batch_counter < MAX_BATCHES:\n",
-    "        batch_counter += 1\n",
-    "        try:\n",
-    "            start_time = time.time()\n",
-    "            batch = next(dataloader)\n",
-    "            # print(f\"Fetch time: {time.time() - start_time:.2f} seconds\")\n",
-    "        except StopIteration:\n",
-    "            break\n",
-    "\n",
-    "        inputs = processor(\n",
-    "            images=batch[\"frame\"],\n",
-    "            text=clip_text_prompts,\n",
-    "            return_tensors=\"pt\",\n",
-    "            padding=True,\n",
-    "            truncation=True,\n",
-    "        )\n",
-    "        inputs = {k: v.to(\"cuda\") for k, v in inputs.items()}\n",
-    "\n",
-    "        outputs = model(**inputs)\n",
-    "\n",
-    "        logits = outputs.logits_per_image\n",
-    "        # SigLIP scores each (image, prompt) pair independently, so a sigmoid\n",
-    "        # (rather than a softmax across prompts) yields per-prompt probabilities.\n",
-    "        probs = torch.nn.functional.sigmoid(logits).detach().cpu().numpy()\n",
-    "\n",
-    "        for video_idx, frame_idx, prob in zip(\n",
-    "            batch[\"video_id\"], batch[\"frame_idx\"], probs\n",
-    "        ):\n",
-    "            # print(type(video_id.item()), type(frame_idx.item()), type(prob.item()))\n",
-    "            video_id = video_infos[video_idx.item()].video_id\n",
-    "\n",
-    "            clip_results_dict[\"video_id\"].append(video_id)\n",
-    "            clip_results_dict[\"frame_idx\"].append(frame_idx.item())\n",
-    "            clip_results_dict[\"probs\"].append(prob.item())\n",
-    "\n",
-    "    print(\"All frames processed\")\n",
-    "    clip_results = pd.DataFrame(clip_results_dict)\n",
-    "    print(\"Dataframe created\")\n",
-    "    print(clip_results)\n",
-    "\n",
-    "    max_gap_seconds = 1\n",
-    "    fps_sampling = 1\n",
-    "    min_prob = 0.1\n",
-    "    min_segment_seconds = 3\n",
-    "    fps = 25\n",
-    "\n",
-    "    segment_infos = []\n",
-    "    for video_id, video_clip_results in clip_results.groupby(\"video_id\"):\n",
-    "        probs = video_clip_results[\"probs\"].values\n",
-    "        probs = smoother.smooth(probs).smooth_data[0]\n",
-    "        segments_start_end = get_segments(\n",
-    "            probs,\n",
-    "            max_gap=round(max_gap_seconds * fps_sampling),\n",
-    "            min_prob=min_prob,\n",
-    "            min_segment=round(min_segment_seconds * fps_sampling),\n",
-    "        )\n",
-    "\n",
-    "        print(f\"Segments for video {video_id}: {segments_start_end}\")\n",
-    "\n",
-    "        sec2ts = lambda s: time.strftime(\n",
-    "            f\"%H:%M:%S.{round((s%1)*1000):03d}\", time.gmtime(s)\n",
-    "        )\n",
-    "\n",
-    "        for start, end in segments_start_end:\n",
-    "            segment_infos.append(\n",
-    "                SegmentInfo(\n",
-    "                    start_timestamp=sec2ts(start),\n",
-    "                    end_timestamp=sec2ts(end),\n",
-    "                    fps=fps,\n",
-    "                    video_id=video_id,\n",
-    "                )\n",
-    "            )\n",
-    "\n",
-    "    return {\"segment_infos\": segment_infos}\n",
-    "\n",
-    "\n",
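-    "# `get_segments` is imported from elsewhere in this repo. For reference, here\n",
-    "# is one plausible minimal reading of its contract (a sketch, not the real\n",
-    "# implementation): keep samples with prob >= min_prob, merge runs separated\n",
-    "# by at most max_gap samples, drop runs shorter than min_segment samples,\n",
-    "# and return (start, end) index pairs.\n",
-    "def _get_segments_sketch(probs, max_gap, min_prob, min_segment):\n",
-    "    active = [i for i, p in enumerate(probs) if p >= min_prob]\n",
-    "    if not active:\n",
-    "        return []\n",
-    "    runs, start, prev = [], active[0], active[0]\n",
-    "    for i in active[1:]:\n",
-    "        if i - prev > max_gap:\n",
-    "            runs.append((start, prev))\n",
-    "            start = i\n",
-    "        prev = i\n",
-    "    runs.append((start, prev))\n",
-    "    return [(s, e) for s, e in runs if e - s >= min_segment]\n",
-    "\n",
-    "\n",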
-    "def extract_clues_node(state: AgentState):\n",
-    "\n",
-    "    prompt_template = ChatPromptTemplate.from_messages(\n",
-    "        [\n",
-    "            (\"system\", EXTRACT_CLUES_PROMPT),\n",
-    "            (\n",
-    "                \"user\",\n",
-    "                \"Segment timecodes: {{ segment_timecodes }}\\nTranscript: {{ transcript }}\",\n",
-    "            ),\n",
-    "        ],\n",
-    "        template_format=\"jinja2\",\n",
-    "    )\n",
-    "\n",
-    "    model = prompt_template | llm.with_structured_output(VideoAnnotation)\n",
-    "\n",
-    "    segment_infos_dict = defaultdict(list)\n",
-    "    for segment_info in state[\"segment_infos\"]:\n",
-    "        segment_infos_dict[segment_info.video_id].append(segment_info)\n",
-    "\n",
-    "    video_infos_dict = {\n",
-    "        video_info.video_id: video_info for video_info in state[\"video_infos\"]\n",
-    "    }\n",
-    "\n",
-    "    clues = []\n",
-    "\n",
-    "    for video_id, segment_infos in segment_infos_dict.items():\n",
-    "        transcript = video_infos_dict[video_id].transcript\n",
-    "        # Process segments in chunks of five per structured-output call.\n",
-    "        segment_infos_chunks = [\n",
-    "            segment_infos[i : i + 5] for i in range(0, len(segment_infos), 5)\n",
-    "        ]\n",
-    "\n",
-    "        for chunk in segment_infos_chunks:\n",
-    "            video_annotation: VideoAnnotation = model.invoke(\n",
-    "                {\n",
-    "                    \"segment_timecodes\": \"\\n\".join(\n",
-    "                        [f\"{s.start_timestamp}-{s.end_timestamp}\" for s in chunk]\n",
-    "                    ),\n",
-    "                    \"transcript\": transcript,\n",
-    "                }\n",
-    "            )\n",
-    "            clues.extend(video_annotation.segments)\n",
-    "\n",
-    "    return {\"clues\": clues}\n",
-    "\n",
-    "\n",
-    "def gen_annotations_node(state: AgentState):\n",
-    "    class SegmentFeedback(BaseModel):\n",
-    "        right: Optional[str] = Field(description=\"what was right in the performance\")\n",
-    "        wrong: Optional[str] = Field(description=\"what was wrong in the performance\")\n",
-    "        correction: Optional[str] = Field(\n",
-    "            description=\"how and in what ways the performance could be improved\"\n",
-    "        )\n",
-    "\n",
-    "    # The segment timestamps are taken from the provided information.\n",
-    "    class SegmentCompleteAnnotation(BaseModel):\n",
-    "        squats_probability: Optional[str] = Field(\n",
-    "            description=\"how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)\"\n",
-    "        )\n",
-    "        squats_technique_correctness: Optional[str] = Field(\n",
-    "            description=\"correctness of the squat technique.\"\n",
-    "        )\n",
-    "        squats_feedback: Optional[SegmentFeedback] = Field(\n",
-    "            description=\"what was right and wrong in the squat performance in the segment. When the technique is incorrect, provide instructions on how to correct it.\"\n",
-    "        )\n",
-    "\n",
-    "    prompt_template = ChatPromptTemplate.from_messages(\n",
-    "        [\n",
-    "            (\"system\", GEN_ANNOTATIONS_PROMPT),\n",
-    "            (\"user\", \"Clues: {{ clues }}\"),\n",
-    "        ],\n",
-    "        template_format=\"jinja2\",\n",
-    "    )\n",
-    "\n",
-    "    model = prompt_template | llm.with_structured_output(SegmentCompleteAnnotation)\n",
-    "\n",
-    "    clues = state[\"clues\"]\n",
-    "\n",
-    "    annotations = []\n",
-    "    for clue in clues:\n",
-    "        segment_annotation: SegmentCompleteAnnotation = model.invoke(\n",
-    "            {\"clues\": clue.json()}\n",
-    "        )\n",
-    "\n",
-    "        annotations.append(segment_annotation.json())\n",
-    "\n",
-    "    print(annotations)\n",
-    "\n",
-    "    return {\"annotations\": annotations}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langgraph.graph import StateGraph, END\n",
-    "from typing import TypedDict, Annotated, List\n",
-    "import operator\n",
-    "from langgraph.checkpoint.memory import MemorySaver\n",
-    "\n",
-    "from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage\n",
-    "\n",
-    "memory = MemorySaver()\n",
-    "# memory = SqliteSaver.from_conn_string(\":memory:\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "builder = StateGraph(AgentState)\n",
-    "\n",
-    "builder.add_node(\"generate_queries\", gen_queries_node)\n",
-    "builder.add_node(\"get_video_ids\", get_video_ids_node)\n",
-    "builder.add_node(\"download\", download_node)\n",
-    "builder.add_node(\"detect_segments\", detect_segments_node)\n",
-    "builder.add_node(\"extract_clues\", extract_clues_node)\n",
-    "builder.add_node(\"gen_annotations\", gen_annotations_node)\n",
-    "\n",
-    "builder.set_entry_point(\"generate_queries\")\n",
-    "\n",
-    "# builder.add_conditional_edges(\n",
-    "#     \"generate\",\n",
-    "#     should_continue,\n",
-    "#     {END: END, \"reflect\": \"reflect\"}\n",
-    "# )\n",
-    "\n",
-    "builder.add_edge(\"generate_queries\", \"get_video_ids\")\n",
-    "builder.add_edge(\"get_video_ids\", \"download\")\n",
-    "builder.add_edge(\"download\", \"detect_segments\")\n",
-    "builder.add_edge(\"detect_segments\", \"extract_clues\")\n",
-    "builder.add_edge(\"extract_clues\", \"gen_annotations\")\n",
-    "builder.add_edge(\"gen_annotations\", END)\n",
-    "\n",
-    "graph = builder.compile(checkpointer=memory)"
-   ]
-  },
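-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Optional: visualize the node wiring above before running the graph.\n",
-    "# Assumes the grandalf package is installed for ASCII rendering.\n",
-    "graph.get_graph().print_ascii()"
-   ]
-  },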
elif \"extract_clues\" in s:\n", - " print(\"extract_clues happened\")\n", - " else:\n", - " print(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "graph.get_state(thread).values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "datagen", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From e25a7dae5350a543b1de2c4981372930daee56a1 Mon Sep 17 00:00:00 2001 From: deepbuzin Date: Wed, 28 Aug 2024 07:34:41 +0000 Subject: [PATCH 9/9] Move task to argument --- agent/data_agent.py | 4 ++-- agent/run_agent.ipynb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/data_agent.py b/agent/data_agent.py index abc6611..e0d68e9 100644 --- a/agent/data_agent.py +++ b/agent/data_agent.py @@ -96,11 +96,11 @@ def gen_annotations_node(self, state: AgentState): annotations = gen_annotations(self.llm, GEN_ANNOTATIONS_PROMPT, state["clues"]) return {"annotations": annotations} - def run(self, thread_id: str): + def run(self, task: str, thread_id: str): thread = {"configurable": {"thread_id": thread_id}} for step in self.graph.stream( { - "task": "i wanna teach people how to do squats", + "task": task, "clip_text_prompts": ["person doing squats"], }, thread, diff --git a/agent/run_agent.ipynb b/agent/run_agent.ipynb index 5377f36..f6252ad 100644 --- a/agent/run_agent.ipynb +++ b/agent/run_agent.ipynb @@ -63,7 +63,7 @@ "metadata": {}, "outputs": [], "source": [ - "agent.run(\"1\")" + "agent.run(\"i wanna teach people how to do squats\", thread_id=\"1\")" ] }, {