Spaces:
Runtime error
Runtime error
| distilabel: | |||
| version: 1.0.1 | |||
| pipeline: | |||
| name: farming | |||
| description: null | |||
| steps: | |||
| - step: | |||
| name: load_data | |||
| input_mappings: {} | |||
| output_mappings: {} | |||
| batch_size: 64 | |||
| data: | |||
| - input: punctures from a Retro bikes perspective | |||
| runtime_parameters_info: | |||
| - name: batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches generated by | |||
| the step. | |||
| type_info: | |||
| module: distilabel.steps.generators.data | |||
| name: LoadDataFromDicts | |||
| name: load_data | |||
| - step: | |||
| name: self-instruct | |||
| input_mappings: {} | |||
| output_mappings: {} | |||
| input_batch_size: 8 | |||
| llm: | |||
| generation_kwargs: {} | |||
| model_id: null | |||
| endpoint_name: null | |||
| endpoint_namespace: null | |||
| base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta | |||
| tokenizer_id: null | |||
| model_display_name: null | |||
| use_openai_client: false | |||
| type_info: | |||
| module: distilabel.llms.huggingface.inference_endpoints | |||
| name: InferenceEndpointsLLM | |||
| group_generations: false | |||
| num_generations: 1 | |||
| num_instructions: 5 | |||
| criteria_for_query_generation: 'Incorporate a diverse range of verbs, avoiding | |||
| repetition. | |||
| Ensure queries are compatible with AI model''s text generation functions and | |||
| are limited to 1-2 sentences. | |||
| Design queries to be self-contained and standalone. | |||
| Blend interrogative (e.g., "What is the significance of x?") and imperative | |||
| (e.g., "Detail the process of x.") styles.' | |||
| application_description: 'You are an AI assistant than generates queries around | |||
| the domain of Bicycle maintenance. | |||
| Your should not expect basic but profound questions from your users. | |||
| The queries should reflect a diversity of vision and economic positions and | |||
| political positions. | |||
| The queries may know about different methods of Bicycle maintenance. | |||
| The queries can be positioned politically, economically, socially, or practically. | |||
| Also take into account the impact of diverse causes on diverse domains.' | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| - name: llm | |||
| runtime_parameters_info: | |||
| - name: generation_kwargs | |||
| description: The kwargs to be propagated to either `generate` or `agenerate` | |||
| methods within each `LLM`. | |||
| keys: | |||
| - name: max_new_tokens | |||
| optional: true | |||
| description: the maximum number of new tokens that the model will generate. Defaults | |||
| to `128`. | |||
| - name: frequency_penalty | |||
| optional: true | |||
| description: the frequency penalty to use for the generation. Defaults to | |||
| `0.0`. Only applies if `use_openai_client=True`. | |||
| - name: presence_penalty | |||
| optional: true | |||
| description: the presence penalty to use for the generation. Defaults | |||
| to `0.0`. Only applies if `use_openai_client=True`. | |||
| - name: repetition_penalty | |||
| optional: true | |||
| description: the repetition penalty to use for the generation. Defaults to | |||
| `None`. Only applies if `use_openai_client=False`. | |||
| - name: temperature | |||
| optional: true | |||
| description: the temperature to use for the generation. Defaults to `1.0`. | |||
| - name: do_sample | |||
| optional: true | |||
| description: whether to use sampling for the generation. Defaults to `False`. Only | |||
| applies if `use_openai_client=False`. | |||
| - name: top_k | |||
| optional: true | |||
| description: the top-k value to use for the generation. Defaults to `0.8`, | |||
| since neither `0.0` nor `1.0` are valid values in TGI. | |||
| - name: top_p | |||
| optional: true | |||
| description: the top-p value to use for the generation. Defaults to `1.0`. | |||
| - name: typical_p | |||
| optional: true | |||
| description: the typical-p value to use for the generation. Defaults to | |||
| `0.5`. | |||
| - name: endpoint_name | |||
| optional: true | |||
| description: The name of the Inference Endpoint to use for the LLM. | |||
| - name: endpoint_namespace | |||
| optional: true | |||
| description: The namespace of the Inference Endpoint to use for the LLM. | |||
| - name: base_url | |||
| optional: true | |||
| description: The base URL to use for the Inference Endpoints API requests. | |||
| - name: api_key | |||
| optional: true | |||
| description: The API key to authenticate the requests to the Inference Endpoints | |||
| API. | |||
| - name: num_generations | |||
| optional: true | |||
| description: The number of generations to be produced per input. | |||
| type_info: | |||
| module: distilabel.steps.tasks.self_instruct | |||
| name: SelfInstruct | |||
| name: self-instruct | |||
| - step: | |||
| name: evol_instruction_complexity | |||
| input_mappings: | |||
| instruction: question | |||
| output_mappings: {} | |||
| input_batch_size: 8 | |||
| llm: | |||
| generation_kwargs: {} | |||
| model_id: null | |||
| endpoint_name: null | |||
| endpoint_namespace: null | |||
| base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta | |||
| tokenizer_id: null | |||
| model_display_name: null | |||
| use_openai_client: false | |||
| type_info: | |||
| module: distilabel.llms.huggingface.inference_endpoints | |||
| name: InferenceEndpointsLLM | |||
| group_generations: false | |||
| num_generations: 1 | |||
| num_evolutions: 2 | |||
| store_evolutions: true | |||
| generate_answers: false | |||
| include_original_instruction: true | |||
| mutation_templates: | |||
| CONSTRAINTS: "I want you act as a Prompt Rewriter.\n\nYour objective is to\ | |||
| \ rewrite a given prompt into a more complex version to make those famous\ | |||
| \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\ | |||
| \ rewritten prompt must be reasonable and must be understood and responded\ | |||
| \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ | |||
| \ table and code in #The Given Prompt#:. Also, please do not omit the input\ | |||
| \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ | |||
| \ the following method: \nPlease add one more constraints/requirements into\ | |||
| \ '#The Given Prompt#'\n\nYou should try your best not to make the #Rewritten\ | |||
| \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\ | |||
| \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\ | |||
| \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\ | |||
| \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" | |||
| DEEPENING: "I want you act as a Prompt Rewriter.\n\nYour objective is to rewrite\ | |||
| \ a given prompt into a more complex version to make those famous AI systems\ | |||
| \ (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the rewritten\ | |||
| \ prompt must be reasonable and must be understood and responded by humans.\n\ | |||
| \nYour rewriting cannot omit the non-text parts such as the table and code\ | |||
| \ in #The Given Prompt#:. Also, please do not omit the input in #The Given\ | |||
| \ Prompt#.\n\nYou SHOULD complicate the given prompt using the following\ | |||
| \ method: \nIf #The Given Prompt# contains inquiries about certain issues,\ | |||
| \ the depth and breadth of the inquiry can be increased.\n\nYou should try\ | |||
| \ your best not to make the #Rewritten Prompt# become verbose, #Rewritten\ | |||
| \ Prompt# can only add 10 to 20 words into #The Given Prompt#.\n\n'#The\ | |||
| \ Given Prompt#', '#Rewritten Prompt#', 'given prompt' and 'rewritten prompt'\ | |||
| \ are not allowed to appear in #Rewritten Prompt#\n\n#The Given Prompt#:\n\ | |||
| <PROMPT>\n#Rewritten Prompt#:\n\n" | |||
| CONCRETIZING: "I want you act as a Prompt Rewriter.\n\nYour objective is to\ | |||
| \ rewrite a given prompt into a more complex version to make those famous\ | |||
| \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\ | |||
| \ rewritten prompt must be reasonable and must be understood and responded\ | |||
| \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ | |||
| \ table and code in #The Given Prompt#:. Also, please do not omit the input\ | |||
| \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ | |||
| \ the following method: \nPlease replace general concepts with more specific\ | |||
| \ concepts.\n\nYou should try your best not to make the #Rewritten Prompt#\ | |||
| \ become verbose, #Rewritten Prompt# can only add 10 to 20 words into #The\ | |||
| \ Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#', 'given prompt'\ | |||
| \ and 'rewritten prompt' are not allowed to appear in #Rewritten Prompt#\n\ | |||
| \n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" | |||
| INCREASED_REASONING_STEPS: "I want you act as a Prompt Rewriter.\n\nYour objective\ | |||
| \ is to rewrite a given prompt into a more complex version to make those\ | |||
| \ famous AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\n\ | |||
| But the rewritten prompt must be reasonable and must be understood and responded\ | |||
| \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ | |||
| \ table and code in #The Given Prompt#:. Also, please do not omit the input\ | |||
| \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ | |||
| \ the following method: \nIf #The Given Prompt# can be solved with just\ | |||
| \ a few simple thinking processes, you can rewrite it to explicitly request\ | |||
| \ multiple-step reasoning.\n\nYou should try your best not to make the #Rewritten\ | |||
| \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\ | |||
| \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\ | |||
| \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\ | |||
| \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" | |||
| BREADTH: 'I want you act as a Prompt Creator. | |||
| Your goal is to draw inspiration from the #Given Prompt# to create a brand | |||
| new prompt. | |||
| This new prompt should belong to the same domain as the #Given Prompt# but | |||
| be even more rare. | |||
| The LENGTH and complexity of the #Created Prompt# should be similar to that | |||
| of the #Given Prompt#. | |||
| The #Created Prompt# must be reasonable and must be understood and responded | |||
| by humans. | |||
| ''#Given Prompt#'', ''#Created Prompt#'', ''given prompt'' and ''created | |||
| prompt'' are not allowed to appear in #Created Prompt# | |||
| #Given Prompt#: | |||
| <PROMPT> | |||
| #Created Prompt#: | |||
| ' | |||
| seed: 42 | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| - name: llm | |||
| runtime_parameters_info: | |||
| - name: generation_kwargs | |||
| description: The kwargs to be propagated to either `generate` or `agenerate` | |||
| methods within each `LLM`. | |||
| keys: | |||
| - name: max_new_tokens | |||
| optional: true | |||
| description: the maximum number of new tokens that the model will generate. Defaults | |||
| to `128`. | |||
| - name: frequency_penalty | |||
| optional: true | |||
| description: the frequency penalty to use for the generation. Defaults to | |||
| `0.0`. Only applies if `use_openai_client=True`. | |||
| - name: presence_penalty | |||
| optional: true | |||
| description: the presence penalty to use for the generation. Defaults | |||
| to `0.0`. Only applies if `use_openai_client=True`. | |||
| - name: repetition_penalty | |||
| optional: true | |||
| description: the repetition penalty to use for the generation. Defaults to | |||
| `None`. Only applies if `use_openai_client=False`. | |||
| - name: temperature | |||
| optional: true | |||
| description: the temperature to use for the generation. Defaults to `1.0`. | |||
| - name: do_sample | |||
| optional: true | |||
| description: whether to use sampling for the generation. Defaults to `False`. Only | |||
| applies if `use_openai_client=False`. | |||
| - name: top_k | |||
| optional: true | |||
| description: the top-k value to use for the generation. Defaults to `0.8`, | |||
| since neither `0.0` nor `1.0` are valid values in TGI. | |||
| - name: top_p | |||
| optional: true | |||
| description: the top-p value to use for the generation. Defaults to `1.0`. | |||
| - name: typical_p | |||
| optional: true | |||
| description: the typical-p value to use for the generation. Defaults to | |||
| `0.5`. | |||
| - name: endpoint_name | |||
| optional: true | |||
| description: The name of the Inference Endpoint to use for the LLM. | |||
| - name: endpoint_namespace | |||
| optional: true | |||
| description: The namespace of the Inference Endpoint to use for the LLM. | |||
| - name: base_url | |||
| optional: true | |||
| description: The base URL to use for the Inference Endpoints API requests. | |||
| - name: api_key | |||
| optional: true | |||
| description: The API key to authenticate the requests to the Inference Endpoints | |||
| API. | |||
| - name: num_generations | |||
| optional: true | |||
| description: The number of generations to be produced per input. | |||
| - name: seed | |||
| optional: true | |||
| description: As `numpy` is being used in order to randomly pick a mutation | |||
| method, it is nice to set a random seed. | |||
| type_info: | |||
| module: distilabel.steps.tasks.evol_instruct.base | |||
| name: EvolInstruct | |||
| name: evol_instruction_complexity | |||
| - step: | |||
| name: expand_columns | |||
| input_mappings: {} | |||
| output_mappings: {} | |||
| input_batch_size: 50 | |||
| columns: | |||
| instructions: question | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| type_info: | |||
| module: distilabel.steps.expand | |||
| name: ExpandColumns | |||
| name: expand_columns | |||
| - step: | |||
| name: clean_numbered_list | |||
| input_mappings: {} | |||
| output_mappings: {} | |||
| input_batch_size: 50 | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| type_info: | |||
| module: domain | |||
| name: CleanNumberedList | |||
| name: clean_numbered_list | |||
| - step: | |||
| name: expand_columns_evolved | |||
| input_mappings: {} | |||
| output_mappings: {} | |||
| input_batch_size: 50 | |||
| columns: | |||
| evolved_instructions: evolved_questions | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| type_info: | |||
| module: distilabel.steps.expand | |||
| name: ExpandColumns | |||
| name: expand_columns_evolved | |||
| - step: | |||
| name: domain_expert | |||
| input_mappings: | |||
| instruction: evolved_questions | |||
| output_mappings: | |||
| generation: domain_expert_answer | |||
| input_batch_size: 8 | |||
| llm: | |||
| generation_kwargs: {} | |||
| model_id: null | |||
| endpoint_name: null | |||
| endpoint_namespace: null | |||
| base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta | |||
| tokenizer_id: null | |||
| model_display_name: null | |||
| use_openai_client: false | |||
| type_info: | |||
| module: distilabel.llms.huggingface.inference_endpoints | |||
| name: InferenceEndpointsLLM | |||
| group_generations: false | |||
| num_generations: 1 | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| - name: llm | |||
| runtime_parameters_info: | |||
| - name: generation_kwargs | |||
| description: The kwargs to be propagated to either `generate` or `agenerate` | |||
| methods within each `LLM`. | |||
| keys: | |||
| - name: max_new_tokens | |||
| optional: true | |||
| description: the maximum number of new tokens that the model will generate. Defaults | |||
| to `128`. | |||
| - name: frequency_penalty | |||
| optional: true | |||
| description: the frequency penalty to use for the generation. Defaults to | |||
| `0.0`. Only applies if `use_openai_client=True`. | |||
| - name: presence_penalty | |||
| optional: true | |||
| description: the presence penalty to use for the generation. Defaults | |||
| to `0.0`. Only applies if `use_openai_client=True`. | |||
| - name: repetition_penalty | |||
| optional: true | |||
| description: the repetition penalty to use for the generation. Defaults to | |||
| `None`. Only applies if `use_openai_client=False`. | |||
| - name: temperature | |||
| optional: true | |||
| description: the temperature to use for the generation. Defaults to `1.0`. | |||
| - name: do_sample | |||
| optional: true | |||
| description: whether to use sampling for the generation. Defaults to `False`. Only | |||
| applies if `use_openai_client=False`. | |||
| - name: top_k | |||
| optional: true | |||
| description: the top-k value to use for the generation. Defaults to `0.8`, | |||
| since neither `0.0` nor `1.0` are valid values in TGI. | |||
| - name: top_p | |||
| optional: true | |||
| description: the top-p value to use for the generation. Defaults to `1.0`. | |||
| - name: typical_p | |||
| optional: true | |||
| description: the typical-p value to use for the generation. Defaults to | |||
| `0.5`. | |||
| - name: endpoint_name | |||
| optional: true | |||
| description: The name of the Inference Endpoint to use for the LLM. | |||
| - name: endpoint_namespace | |||
| optional: true | |||
| description: The namespace of the Inference Endpoint to use for the LLM. | |||
| - name: base_url | |||
| optional: true | |||
| description: The base URL to use for the Inference Endpoints API requests. | |||
| - name: api_key | |||
| optional: true | |||
| description: The API key to authenticate the requests to the Inference Endpoints | |||
| API. | |||
| - name: num_generations | |||
| optional: true | |||
| description: The number of generations to be produced per input. | |||
| type_info: | |||
| module: domain | |||
| name: DomainExpert | |||
| name: domain_expert | |||
| - step: | |||
| name: keep_columns | |||
| input_mappings: {} | |||
| output_mappings: {} | |||
| input_batch_size: 50 | |||
| columns: | |||
| - model_name | |||
| - evolved_questions | |||
| - domain_expert_answer | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| type_info: | |||
| module: distilabel.steps.keep | |||
| name: KeepColumns | |||
| name: keep_columns | |||
| - step: | |||
| name: text_generation_to_argilla | |||
| input_mappings: | |||
| instruction: evolved_questions | |||
| generation: domain_expert_answer | |||
| output_mappings: {} | |||
| input_batch_size: 50 | |||
| dataset_name: bicycle_maintenance | |||
| dataset_workspace: admin | |||
| api_url: https://burtenshaw-bicycle-maintenance-argilla-space.hf.space | |||
| runtime_parameters_info: | |||
| - name: input_batch_size | |||
| optional: true | |||
| description: The number of rows that will contain the batches processed by | |||
| the step. | |||
| - name: dataset_name | |||
| optional: false | |||
| description: The name of the dataset in Argilla. | |||
| - name: dataset_workspace | |||
| optional: true | |||
| description: The workspace where the dataset will be created in Argilla. Defaults to | |||
| `None` which means it will be created in the default workspace. | |||
| - name: api_url | |||
| optional: true | |||
| description: The base URL to use for the Argilla API requests. | |||
| - name: api_key | |||
| optional: true | |||
| description: The API key to authenticate the requests to the Argilla API. | |||
| type_info: | |||
| module: distilabel.steps.argilla.text_generation | |||
| name: TextGenerationToArgilla | |||
| name: text_generation_to_argilla | |||
| connections: | |||
| - from: load_data | |||
| to: | |||
| - self-instruct | |||
| - from: self-instruct | |||
| to: | |||
| - expand_columns | |||
| - from: evol_instruction_complexity | |||
| to: | |||
| - expand_columns_evolved | |||
| - from: expand_columns | |||
| to: | |||
| - clean_numbered_list | |||
| - from: clean_numbered_list | |||
| to: | |||
| - evol_instruction_complexity | |||
| - from: expand_columns_evolved | |||
| to: | |||
| - domain_expert | |||
| - from: domain_expert | |||
| to: | |||
| - keep_columns | |||
| - from: keep_columns | |||
| to: | |||
| - text_generation_to_argilla | |||
| - from: text_generation_to_argilla | |||
| to: [] | |||
| type_info: | |||
| module: distilabel.pipeline.local | |||
| name: Pipeline | |||