Skip to content

Commit

Permalink
Merge pull request #5022 from oobabooga/dev
Browse files: browse the repository at this point in the history
Merge dev branch
  • Loading branch information
oobabooga authored Dec 20, 2023
2 parents: 489f4a2 + 6efbe30; commit 11288d1
Show file tree
Hide file tree
Showing 12 changed files with 22 additions and 35 deletions.
5 changes: 4 additions & 1 deletion modules/LoRA.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@ def add_lora_exllama(lora_names):

lora_path = get_lora_path(lora_names[0])
lora_config_path = lora_path / "adapter_config.json"
lora_adapter_path = lora_path / "adapter_model.bin"
for file_name in ["adapter_model.safetensors", "adapter_model.bin"]:
file_path = lora_path / file_name
if file_path.is_file():
lora_adapter_path = file_path

logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
if shared.model.__class__.__name__ == 'ExllamaModel':
Expand Down
35 changes: 7 additions & 28 deletions modules/exllamav2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import random
import traceback
from pathlib import Path

Expand All @@ -10,7 +9,7 @@
ExLlamaV2Config,
ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator

from modules import shared
from modules.logging_colors import logger
Expand Down Expand Up @@ -64,7 +63,7 @@ def from_pretrained(self, path_to_model):
else:
cache = ExLlamaV2Cache(model)

generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

result = self()
result.model = model
Expand Down Expand Up @@ -115,41 +114,21 @@ def generate_with_streaming(self, prompt, state):

ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
ids = ids[:, -get_max_prompt_length(state):]
initial_len = ids.shape[-1]

if state['auto_max_new_tokens']:
max_new_tokens = state['truncation_length'] - ids.shape[-1]
else:
max_new_tokens = state['max_new_tokens']

# _gen_begin_base
self.cache.current_seq_len = 0
self.model.forward(ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)
self.generator.begin_stream(ids, settings, loras=self.loras)

has_leading_space = False
decoded_text = ''
for i in range(max_new_tokens):
logits = self.model.forward(ids[:, -1:], self.cache, input_mask=None, loras=self.loras).float().cpu()
token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), self.tokenizer)
ids = torch.cat([ids, token], dim=1)

if i == 0 and self.tokenizer.tokenizer.id_to_piece(int(token)).startswith('▁'):
has_leading_space = True

decoded_text = self.tokenizer.decode(ids[:, initial_len:], decode_special_tokens=not state['skip_special_tokens'])[0]
if has_leading_space:
decoded_text = ' ' + decoded_text

# Check the partial unicode character
if chr(0xfffd) in decoded_text:
is_last = i == max_new_tokens - 1
is_stopping = token.item() == self.tokenizer.eos_token_id or shared.stop_everything
# If we are not at the end of the generation, we skip this token
if not (is_last or is_stopping):
continue

if token.item() == self.tokenizer.eos_token_id or shared.stop_everything:
chunk, eos, _ = self.generator.stream()
if eos or shared.stop_everything:
break

decoded_text += chunk
yield decoded_text

def generate(self, prompt, state):
Expand Down
8 changes: 2 additions & 6 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,12 +413,8 @@ def ExLlamav2_HF_loader(model_name):


def HQQ_loader(model_name):
try:
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.engine.hf import HQQModelForCausalLM
except ModuleNotFoundError:
logger.error("HQQ is not installed. You can install it with:\n\npip install hqq")
return None
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.engine.hf import HQQModelForCausalLM

logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_amd.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_amd_noavx2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_apple_intel.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_apple_silicon.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_cpu_only.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_cpu_only_noavx2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_noavx2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down
1 change: 1 addition & 0 deletions requirements_nowheels.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ datasets
einops
exllamav2==0.0.11
gradio==3.50.*
hqq==0.1.1.post1
markdown
numpy==1.24.*
optimum==1.16.*
Expand Down

0 comments on commit 11288d1

Please sign in to comment.