diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md
index c5f3c5f55f2c56..35f2467486a895 100644
--- a/docs/source/en/model_doc/vipllava.md
+++ b/docs/source/en/model_doc/vipllava.md
@@ -37,13 +37,13 @@ Tips:
 - For better results, we recommend users to prompt the model with the correct prompt format:
 
 ```bash
-"USER: <image>\n<prompt>ASSISTANT:"
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n<prompt>###Assistant:
 ```
 
 For multiple turns conversation:
 
 ```bash
-"USER: <image>\n<prompt1>ASSISTANT: <answer1>USER: <prompt2>ASSISTANT: <answer2>USER: <prompt3>ASSISTANT:"
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n<prompt1>###Assistant: <answer1>###Human: <prompt2>###Assistant:
 ```
 
 The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA).
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index f9b1d5f3c93a8c..1ccabd754f9084 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -367,23 +367,26 @@ def forward(
         Example:
 
         ```python
+        >>> import torch
         >>> from PIL import Image
         >>> import requests
         >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration
 
-        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vipllava-7b-hf")
-        >>> processor = AutoProcessor.from_pretrained("llava-hf/vipllava-7b-hf")
+        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")
 
-        >>> prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
+        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
+        >>> question = "Can you please describe this image?"
+        >>> prompt = prompt.format(question)
         >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(text=text, images=image, return_tensors="pt")
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)
 
         >>> # Generate
         >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "USER: <image>\nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, with a red ball in its paw."
+        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
+        'The image features a brown and white cat sitting on a green surface, with a red ball in its'
         ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions