diff --git a/Llama/Dockerfile.tpl b/Llama/Dockerfile.tpl
index 95d80a689b..475bfe629f 100644
--- a/Llama/Dockerfile.tpl
+++ b/Llama/Dockerfile.tpl
@@ -18,7 +18,7 @@ RUN pip install --no-cache-dir transformers
 # Install acceletate
 RUN pip install accelerate
 
-# Make port 80 available to the world outside this container
+# Make port 8547 available to the world outside this container
 EXPOSE 8547
 
 # Run app.py when the container launches
diff --git a/Llama/Models/README.md b/Llama/Models/README.md
index b6e7402be9..8d1119e141 100644
--- a/Llama/Models/README.md
+++ b/Llama/Models/README.md
@@ -1 +1,14 @@
-Keep all llama models here for docker build.
\ No newline at end of file
+Keep all llama models here for docker build.
+
+# Downloading Model from Hugging Face
+
+Please make sure you have git lfs installed before cloning the model.
+
+```bash
+git lfs install
+```
+
+```bash
+# Here we are downloading the Meta-Llama-3-8B-Instruct model
+git clone https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
+```
diff --git a/Llama/app.py b/Llama/app.py
index 639f376ac6..882bd72a15 100644
--- a/Llama/app.py
+++ b/Llama/app.py
@@ -9,20 +9,17 @@ from pydantic import BaseModel
 class Prompt(BaseModel):
     prompt: str
 
-model_path = "./Models/Llama-2-7b-chat-hf"
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 
-tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
 pipeline = transformers.pipeline(
     "text-generation",
-    model=model_path,
-    # torch_dtype=torch.float32, # for CPU
-    torch_dtype=torch.float16, # for GPU
+    model=model_id,
+    model_kwargs={"torch_dtype": torch.bfloat16},
     device_map="auto",
 )
 
 app = FastAPI()
 
-
 @app.post("/prompt/")
 async def create_item(prompt: Prompt):
@@ -30,22 +27,29 @@ async def create_item(prompt: Prompt):
 
     if not prompt:
         return {"error": "Prompt is required"}
 
-    sequences = pipeline(
-        prompt.prompt,
+    messages = [
+        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+        {"role": "user", "content": "Who are you?"},
+    ]
+
+    terminators = [
+        pipeline.tokenizer.eos_token_id,
+        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+    ]
+
+    outputs = pipeline(
+        messages,
+        max_new_tokens=256,
+        eos_token_id=terminators,
         do_sample=True,
-        top_k=10,
-        num_return_sequences=1,
-        eos_token_id=tokenizer.eos_token_id,
-        max_length=200,
+        temperature=0.6,
+        top_p=0.9,
     )
+
 
-    prompt_response_array = []
-
-    for seq in sequences:
-        print(f"Result: {seq['generated_text']}")
-        prompt_response_array.append(seq["generated_text"])
+    output = outputs[0]["generated_text"][-1]
 
     # return prompt response
-    return {"response": prompt_response_array}
+    return {"response": output}
 
diff --git a/docker-compose.base.yml b/docker-compose.base.yml
index c2ba5bdcef..386b5420a7 100644
--- a/docker-compose.base.yml
+++ b/docker-compose.base.yml
@@ -176,7 +176,19 @@ services:
       driver: "local"
       options:
         max-size: "1000m"
-
+
+
+  llama:
+    networks:
+      - oneuptime
+    restart: always
+    environment:
+      <<: *common-server-variables
+      PORT: 8547
+    logging:
+      driver: "local"
+      options:
+        max-size: "1000m"
 
   admin-dashboard:
     networks:
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
index a4948f62df..d6b74510e0 100644
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -12,7 +12,16 @@ services:
       context: .
       dockerfile: ./Haraka/Dockerfile
 
-
+
+  llama:
+    extends:
+      file: ./docker-compose.base.yml
+      service: llama
+    build:
+      network: host
+      context: .
+      dockerfile: ./Llama/Dockerfile
+
   redis:
     ports:
       - '6310:6379'
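
For local testing, one way to bring up just the new service is to merge the base and dev compose files and target `llama` directly. This is a sketch only: it assumes `Llama/Dockerfile` has already been generated from `Dockerfile.tpl`, that the Meta-Llama-3-8B-Instruct weights have been cloned into `Llama/Models/` as the README above describes, and that the repository's normal compose tooling is not required for these two files.

```bash
# Sketch, not the project's official workflow:
# assumes Llama/Dockerfile has been generated from Dockerfile.tpl
# and the model weights already sit under Llama/Models/.
docker compose -f docker-compose.base.yml -f docker-compose.dev.yml up --build llama
```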
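
Once the container is up, the reworked `/prompt/` endpoint can be exercised with a plain HTTP POST whose JSON body follows the `Prompt` model. `localhost:8547` is an assumption based on the `EXPOSE 8547` and `PORT: 8547` settings above; no host port mapping is visible in these hunks, so adjust as needed. Note that, as written, the handler only validates the posted `prompt` and generates from the hard-coded pirate `messages`, so the reply will not reflect the request text.

```bash
# Hypothetical request against the new endpoint; host and port are assumptions.
curl -X POST http://localhost:8547/prompt/ \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Who are you?"}'
```

The reply should come back as `{"response": {...}}`, where the inner object is the last chat message produced by the pipeline (roughly `{"role": "assistant", "content": "..."}`).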