berkeley-function-calling-leaderboard

version

1.0.0

license

apache-2.0

usage

unrestricted

languages

eng

format

json

channel

sampling rate

bit depth

duration

0 days 00:00:00

files

5251, duration distribution: each file is 0.0 s

repository

audb-public

Description

The Berkeley Function Calling Leaderboard (BFCL) is a live leaderboard that evaluates the ability of different LLMs to call functions (also referred to as tools). We built this dataset from our learnings to be representative of most users’ function-calling use cases, for example in agents or as part of enterprise workflows. To this end, our evaluation dataset spans diverse categories and multiple programming languages (e.g. Java, JavaScript, and SQL). Check out the leaderboard at https://gorilla.cs.berkeley.edu/leaderboard.html and find further information at https://huggingface.co/datasets/gorilla-llm/Berkeley-Function-Calling-Leaderboard.

Example

json/bfcl-v3-exec-simple/sample-89.json

[
  {
    "role": "system",
    "tools": [
      {
        "type": "function",
        "function": {
          "name": "calculate_nutritional_needs",
          "description": "Calculates the nutritional needs of a person based on their weight, height, age, gender, activity level, and goal.",
          "parameters": {
            "type": "object",
            "properties": {
              "weight": {
                "type": "number",
                "description": "The weight of the person in kilograms."
              },
              "height": {
                "type": "number",
                "description": "The height of the person in centimeters."
              },
              "age": {
                "type": "number",
                "description": "The age of the person in years."
              },
              "gender": {
                "type": "string",
                "description": "The gender of the person. Possible options [male, female, other]."
              },
              "activity_level": {
                "type": "integer",
                "description": "The activity level of the person. Possible options [1,2,3,4,5]."
              },
              "goal": {
                "type": "string",
                "description": "The goal of the person. Possible options [lose, gain, maintain]."
              }
            },
            "required": [
              "weight",
              "height",
              "age",
              "gender",
              "activity_level",
              "goal"
            ]
          }
        }
      }
    ]
  },
  {
    "role": "human",
    "text": "I have an 80-year-old female client who is 170 cm tall, weighs 59 kg, and is quite active with an activity level of 4. She's looking to reduce her weight. Could you calculate her daily nutritional needs based on these details?"
  },
  {
    "role": "assistant",
    "tool_calls": [
      {
        "type": "function",
        "function": {
          "name": "calculate_nutritional_needs",
          "arguments": {
            "weight": 59,
            "height": 170,
            "age": 80,
            "gender": "female",
            "activity_level": 4,
            "goal": "lose"
          }
        }
      }
    ],
    "meta": {
      "source": "truth"
    }
  }
]

Tables

Click on a row to toggle a preview.

ID

Type

Columns

bfcl-v3-chatable

filewise

topic, turns

file

topic

turns

json/bfcl-v3-chatable/sample-0.json

chatable

1

json/bfcl-v3-chatable/sample-1.json

chatable

1

json/bfcl-v3-chatable/sample-2.json

chatable

1

json/bfcl-v3-chatable/sample-3.json

chatable

1

json/bfcl-v3-chatable/sample-4.json

chatable

1

200 rows x 2 columns

bfcl-v3-exec-multiple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-exec-multiple/sample-0.json

exec-multiple

3

json/bfcl-v3-exec-multiple/sample-1.json

exec-multiple

3

json/bfcl-v3-exec-multiple/sample-2.json

exec-multiple

3

json/bfcl-v3-exec-multiple/sample-3.json

exec-multiple

3

json/bfcl-v3-exec-multiple/sample-4.json

exec-multiple

3

50 rows x 2 columns

bfcl-v3-exec-parallel

filewise

topic, turns

file

topic

turns

json/bfcl-v3-exec-parallel/sample-0.json

exec-parallel

3

json/bfcl-v3-exec-parallel/sample-1.json

exec-parallel

3

json/bfcl-v3-exec-parallel/sample-2.json

exec-parallel

3

json/bfcl-v3-exec-parallel/sample-3.json

exec-parallel

3

json/bfcl-v3-exec-parallel/sample-4.json

exec-parallel

3

50 rows x 2 columns

bfcl-v3-exec-parallel-multiple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-exec-parallel-multiple/sample-0.json

exec-parallel-multiple

3

json/bfcl-v3-exec-parallel-multiple/sample-1.json

exec-parallel-multiple

3

json/bfcl-v3-exec-parallel-multiple/sample-2.json

exec-parallel-multiple

3

json/bfcl-v3-exec-parallel-multiple/sample-3.json

exec-parallel-multiple

3

json/bfcl-v3-exec-parallel-multiple/sample-4.json

exec-parallel-multiple

3

40 rows x 2 columns

bfcl-v3-exec-simple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-exec-simple/sample-0.json

exec-simple

3

json/bfcl-v3-exec-simple/sample-1.json

exec-simple

3

json/bfcl-v3-exec-simple/sample-2.json

exec-simple

3

json/bfcl-v3-exec-simple/sample-3.json

exec-simple

3

json/bfcl-v3-exec-simple/sample-4.json

exec-simple

3

100 rows x 2 columns

bfcl-v3-irrelevance

filewise

topic, turns

file

topic

turns

json/bfcl-v3-irrelevance/sample-0.json

irrelevance

2

json/bfcl-v3-irrelevance/sample-1.json

irrelevance

2

json/bfcl-v3-irrelevance/sample-2.json

irrelevance

2

json/bfcl-v3-irrelevance/sample-3.json

irrelevance

2

json/bfcl-v3-irrelevance/sample-4.json

irrelevance

2

240 rows x 2 columns

bfcl-v3-java

filewise

topic, turns

file

topic

turns

json/bfcl-v3-java/sample-0.json

java

3

json/bfcl-v3-java/sample-1.json

java

3

json/bfcl-v3-java/sample-2.json

java

3

json/bfcl-v3-java/sample-3.json

java

3

json/bfcl-v3-java/sample-4.json

java

3

100 rows x 2 columns

bfcl-v3-javascript

filewise

topic, turns

file

topic

turns

json/bfcl-v3-javascript/sample-0.json

javascript

3

json/bfcl-v3-javascript/sample-1.json

javascript

3

json/bfcl-v3-javascript/sample-2.json

javascript

3

json/bfcl-v3-javascript/sample-3.json

javascript

3

json/bfcl-v3-javascript/sample-4.json

javascript

3

50 rows x 2 columns

bfcl-v3-live-irrelevance

filewise

topic, turns

file

topic

turns

json/bfcl-v3-live-irrelevance/sample-0.json

live-irrelevance

2

json/bfcl-v3-live-irrelevance/sample-1.json

live-irrelevance

2

json/bfcl-v3-live-irrelevance/sample-2.json

live-irrelevance

2

json/bfcl-v3-live-irrelevance/sample-3.json

live-irrelevance

2

json/bfcl-v3-live-irrelevance/sample-4.json

live-irrelevance

2

882 rows x 2 columns

bfcl-v3-live-multiple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-live-multiple/sample-0.json

live-multiple

3

json/bfcl-v3-live-multiple/sample-1.json

live-multiple

4

json/bfcl-v3-live-multiple/sample-2.json

live-multiple

3

json/bfcl-v3-live-multiple/sample-3.json

live-multiple

3

json/bfcl-v3-live-multiple/sample-4.json

live-multiple

3

1053 rows x 2 columns

bfcl-v3-live-parallel

filewise

topic, turns

file

topic

turns

json/bfcl-v3-live-parallel/sample-0.json

live-parallel

3

json/bfcl-v3-live-parallel/sample-1.json

live-parallel

3

json/bfcl-v3-live-parallel/sample-2.json

live-parallel

3

json/bfcl-v3-live-parallel/sample-3.json

live-parallel

4

json/bfcl-v3-live-parallel/sample-4.json

live-parallel

3

16 rows x 2 columns

bfcl-v3-live-parallel-multiple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-live-parallel-multiple/sample-0.json

live-parallel-multiple

3

json/bfcl-v3-live-parallel-multiple/sample-1.json

live-parallel-multiple

3

json/bfcl-v3-live-parallel-multiple/sample-2.json

live-parallel-multiple

3

json/bfcl-v3-live-parallel-multiple/sample-3.json

live-parallel-multiple

3

json/bfcl-v3-live-parallel-multiple/sample-4.json

live-parallel-multiple

3

24 rows x 2 columns

bfcl-v3-live-relevance

filewise

topic, turns

file

topic

turns

json/bfcl-v3-live-relevance/sample-0.json

live-relevance

2

json/bfcl-v3-live-relevance/sample-1.json

live-relevance

2

json/bfcl-v3-live-relevance/sample-2.json

live-relevance

2

json/bfcl-v3-live-relevance/sample-3.json

live-relevance

2

json/bfcl-v3-live-relevance/sample-4.json

live-relevance

2

18 rows x 2 columns

bfcl-v3-live-simple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-live-simple/sample-0.json

live-simple

3

json/bfcl-v3-live-simple/sample-1.json

live-simple

3

json/bfcl-v3-live-simple/sample-2.json

live-simple

3

json/bfcl-v3-live-simple/sample-3.json

live-simple

3

json/bfcl-v3-live-simple/sample-4.json

live-simple

3

258 rows x 2 columns

bfcl-v3-multi-turn-base

filewise

topic, turns

file

topic

turns

json/bfcl-v3-multi-turn-base/sample-0.json

multi-turn-base

10

json/bfcl-v3-multi-turn-base/sample-1.json

multi-turn-base

10

json/bfcl-v3-multi-turn-base/sample-2.json

multi-turn-base

12

json/bfcl-v3-multi-turn-base/sample-3.json

multi-turn-base

6

json/bfcl-v3-multi-turn-base/sample-4.json

multi-turn-base

8

200 rows x 2 columns

bfcl-v3-multi-turn-composite

filewise

topic, turns

file

topic

turns

json/bfcl-v3-multi-turn-composite/sample-0.json

multi-turn-composite

12

json/bfcl-v3-multi-turn-composite/sample-1.json

multi-turn-composite

12

json/bfcl-v3-multi-turn-composite/sample-2.json

multi-turn-composite

13

json/bfcl-v3-multi-turn-composite/sample-3.json

multi-turn-composite

8

json/bfcl-v3-multi-turn-composite/sample-4.json

multi-turn-composite

10

200 rows x 2 columns

bfcl-v3-multi-turn-long-context

filewise

topic, turns

file

topic

turns

json/bfcl-v3-multi-turn-long-context/sample-0.json

multi-turn-long-context

10

json/bfcl-v3-multi-turn-long-context/sample-1.json

multi-turn-long-context

10

json/bfcl-v3-multi-turn-long-context/sample-2.json

multi-turn-long-context

12

json/bfcl-v3-multi-turn-long-context/sample-3.json

multi-turn-long-context

6

json/bfcl-v3-multi-turn-long-context/sample-4.json

multi-turn-long-context

8

200 rows x 2 columns

bfcl-v3-multi-turn-miss-func

filewise

topic, turns

file

topic

turns

json/bfcl-v3-multi-turn-miss-func/sample-0.json

multi-turn-miss-func

11

json/bfcl-v3-multi-turn-miss-func/sample-1.json

multi-turn-miss-func

11

json/bfcl-v3-multi-turn-miss-func/sample-2.json

multi-turn-miss-func

13

json/bfcl-v3-multi-turn-miss-func/sample-3.json

multi-turn-miss-func

7

json/bfcl-v3-multi-turn-miss-func/sample-4.json

multi-turn-miss-func

9

200 rows x 2 columns

bfcl-v3-multi-turn-miss-param

filewise

topic, turns

file

topic

turns

json/bfcl-v3-multi-turn-miss-param/sample-0.json

multi-turn-miss-param

11

json/bfcl-v3-multi-turn-miss-param/sample-1.json

multi-turn-miss-param

11

json/bfcl-v3-multi-turn-miss-param/sample-2.json

multi-turn-miss-param

13

json/bfcl-v3-multi-turn-miss-param/sample-3.json

multi-turn-miss-param

7

json/bfcl-v3-multi-turn-miss-param/sample-4.json

multi-turn-miss-param

9

200 rows x 2 columns

bfcl-v3-multiple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-multiple/sample-0.json

multiple

3

json/bfcl-v3-multiple/sample-1.json

multiple

3

json/bfcl-v3-multiple/sample-2.json

multiple

3

json/bfcl-v3-multiple/sample-3.json

multiple

3

json/bfcl-v3-multiple/sample-4.json

multiple

3

200 rows x 2 columns

bfcl-v3-parallel

filewise

topic, turns

file

topic

turns

json/bfcl-v3-parallel/sample-0.json

parallel

3

json/bfcl-v3-parallel/sample-1.json

parallel

3

json/bfcl-v3-parallel/sample-2.json

parallel

3

json/bfcl-v3-parallel/sample-3.json

parallel

3

json/bfcl-v3-parallel/sample-4.json

parallel

3

200 rows x 2 columns

bfcl-v3-parallel-multiple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-parallel-multiple/sample-0.json

parallel-multiple

3

json/bfcl-v3-parallel-multiple/sample-1.json

parallel-multiple

3

json/bfcl-v3-parallel-multiple/sample-2.json

parallel-multiple

3

json/bfcl-v3-parallel-multiple/sample-3.json

parallel-multiple

3

json/bfcl-v3-parallel-multiple/sample-4.json

parallel-multiple

3

200 rows x 2 columns

bfcl-v3-rest

filewise

topic, turns

file

topic

turns

json/bfcl-v3-rest/sample-0.json

rest

2

json/bfcl-v3-rest/sample-1.json

rest

2

json/bfcl-v3-rest/sample-2.json

rest

2

json/bfcl-v3-rest/sample-3.json

rest

2

json/bfcl-v3-rest/sample-4.json

rest

2

70 rows x 2 columns

bfcl-v3-simple

filewise

topic, turns

file

topic

turns

json/bfcl-v3-simple/sample-0.json

simple

3

json/bfcl-v3-simple/sample-1.json

simple

3

json/bfcl-v3-simple/sample-2.json

simple

3

json/bfcl-v3-simple/sample-3.json

simple

3

json/bfcl-v3-simple/sample-4.json

simple

3

400 rows x 2 columns

bfcl-v3-sql

filewise

topic, turns

file

topic

turns

json/bfcl-v3-sql/sample-0.json

sql

3

json/bfcl-v3-sql/sample-1.json

sql

3

json/bfcl-v3-sql/sample-2.json

sql

3

json/bfcl-v3-sql/sample-3.json

sql

3

json/bfcl-v3-sql/sample-4.json

sql

3

100 rows x 2 columns

Schemes

ID

Dtype

topic

str

turns

int