diff --git a/.github/workflows/lmql-ci.yml b/.github/workflows/lmql-ci.yml deleted file mode 100644 index 1a7900d8..00000000 --- a/.github/workflows/lmql-ci.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: Run Tests and Publish Wheel - -on: - # on release publish - release: - types: [released] - - -jobs: - test-without-hf-transformers: - runs-on: lmql-ci - steps: - - uses: actions/checkout@v3 - - name: Setup Fresh Virtual Environment - run: | - pip install --upgrade pip - python3.10 -m venv env - export PATH=$PATH:/home/docker/.local/bin - source env/bin/activate - echo "VIRTUAL ENV:" $VIRTUAL_ENV - - name: Install Dependencies - run: source env/bin/activate && pip install -e . && pip install langchain - - name: Greet - env: - OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} - run: source env/bin/activate && python -m lmql.cli hello openai - test-with-hf-transformers: - runs-on: lmql-ci - needs: [test-without-hf-transformers] - steps: - - uses: actions/checkout@v3 - - name: Setup Fresh Virtual Environment - run: | - pip install --upgrade pip - python3.10 -m venv env - export PATH=$PATH:/home/docker/.local/bin - source env/bin/activate - echo "VIRTUAL ENV:" $VIRTUAL_ENV - - name: Install Dependencies - run: source env/bin/activate && pip install -e '.[hf,tests]' && pip install langchain - - name: Run Tests - env: - OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} - run: source env/bin/activate && python src/lmql/tests/all.py --failearly - publish: - runs-on: lmql-ci - needs: [test-with-hf-transformers, test-without-hf-transformers] - steps: - - uses: actions/checkout@v3 - - name: Setup Fresh Virtual Environment - run: | - pip install --upgrade pip - python3.10 -m venv env - export PATH=$PATH:/home/docker/.local/bin - source env/bin/activate - echo "VIRTUAL ENV:" $VIRTUAL_ENV - - name: Install Packaging Dependencies - run: pip install build twine - - name: Package - env: - VERSION: ${{ github.ref }} - run: bash scripts/wheel.sh $(echo $VERSION | sed 's/^refs\/tags\/v//') - - name: Publish - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - VERSION: ${{ github.ref }} - run: bash scripts/pypi-release.sh lmql-$(echo $VERSION | sed 's/^refs\/tags\/v//') --production \ No newline at end of file diff --git a/.github/workflows/lmql-optional-tests.yml b/.github/workflows/lmql-optional-tests.yml new file mode 100644 index 00000000..d6ecce9b --- /dev/null +++ b/.github/workflows/lmql-optional-tests.yml @@ -0,0 +1,31 @@ +name: Run Optional Tests + +# optional tests only run manually +on: workflow_dispatch + +jobs: + optional-tests: + runs-on: lmql-ci + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + # cache dependencies + - name: Cache dependencies + id: cache + uses: corca-ai/local-cache@v2 + with: + path: env/lib/python3.10/site-packages + key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.cfg') }} + base: "/home/docker/.cache/" + - name: Install Dependencies + run: source env/bin/activate && pip install -e '.[hf,hf-accel,tests,llama]' && pip install langchain + - name: Run Optional Tests + env: + OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} + run: source env/bin/activate && python src/lmql/tests/all.py only optional openai langchain --failearly \ No newline at end of file diff --git a/.github/workflows/lmql-release-test.yml b/.github/workflows/lmql-release-test.yml deleted 
file mode 100644 index 1702a94d..00000000 --- a/.github/workflows/lmql-release-test.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Test Release to test.pypi.org - -on: - workflow_dispatch: - inputs: - version: - description: 'Version to publish' - required: true - - -jobs: - release-test-wheel: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Setup Fresh Virtual Environment - run: | - pip install --upgrade pip - python3.10 -m venv env - export PATH=$PATH:/home/docker/.local/bin - source env/bin/activate - echo "VIRTUAL ENV:" $VIRTUAL_ENV - - name: Install Packaging Dependencies - run: pip install build twine - - name: Package - env: - VERSION: ${{ github.event.inputs.version }} - run: bash scripts/wheel.sh $(echo $VERSION | sed 's/^refs\/tags\/v//') - - name: Publish - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }} - VERSION: ${{ github.event.inputs.version }} - run: bash scripts/pypi-release.sh lmql-$(echo $VERSION | sed 's/^refs\/tags\/v//') \ No newline at end of file diff --git a/.github/workflows/lmql-release.yml b/.github/workflows/lmql-release.yml new file mode 100644 index 00000000..b748c06b --- /dev/null +++ b/.github/workflows/lmql-release.yml @@ -0,0 +1,82 @@ +name: Test and Publish New Release + +on: + # on release publish + release: + types: [released] + + +jobs: + tests: + runs-on: lmql-ci + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + # cache dependencies + - name: Cache dependencies + id: cache + uses: corca-ai/local-cache@v2 + with: + path: env + key: ${{ runner.os }}-env-${{ hashFiles('**/setup.cfg') }} + base: "/home/docker/.cache/" + - name: Install Dependencies + run: source env/bin/activate && pip install -e '.[hf,hf-accel,tests,llama]' + - name: Run Default Tests + run: source env/bin/activate && python src/lmql/tests/all.py --failearly + optional-tests: + runs-on: lmql-ci + + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + # cache dependencies + - name: Cache dependencies + id: cache + uses: corca-ai/local-cache@v2 + with: + path: env/lib/python3.10/site-packages + key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.cfg') }} + base: "/home/docker/.cache/" + - name: Install Dependencies + run: source env/bin/activate && pip install -e '.[hf,hf-accel,tests,llama]' && pip install langchain + - name: Run Optional Tests + env: + OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} + run: source env/bin/activate && python src/lmql/tests/all.py only optional openai langchain --failearly + publish: + runs-on: lmql-ci + needs: [tests, optional-tests] + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + - name: Install Packaging Dependencies + run: pip install build twine + - name: Package + env: + VERSION: ${{ github.ref }} + run: bash scripts/wheel.sh $(echo $VERSION | sed 's/^refs\/tags\/v//') + - name: Publish + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + VERSION: ${{ github.ref }} + run: bash 
scripts/pypi-release.sh lmql-$(echo $VERSION | sed 's/^refs\/tags\/v//') --production \ No newline at end of file diff --git a/.github/workflows/lmql-test-release.yml b/.github/workflows/lmql-test-release.yml new file mode 100644 index 00000000..62a09b93 --- /dev/null +++ b/.github/workflows/lmql-test-release.yml @@ -0,0 +1,84 @@ +name: Test Release to test.pypi.org + +on: + workflow_dispatch: + inputs: + version: + description: 'Version to publish' + required: true + + +jobs: + tests: + runs-on: lmql-ci + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + # cache dependencies + - name: Cache dependencies + id: cache + uses: corca-ai/local-cache@v2 + with: + path: env + key: ${{ runner.os }}-env-${{ hashFiles('**/setup.cfg') }} + base: "/home/docker/.cache/" + - name: Install Dependencies + run: source env/bin/activate && pip install -e '.[hf,hf-accel,tests,llama]' + - name: Run Default Tests + run: source env/bin/activate && python src/lmql/tests/all.py --failearly + optional-tests: + runs-on: lmql-ci + + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + # cache dependencies + - name: Cache dependencies + id: cache + uses: corca-ai/local-cache@v2 + with: + path: env/lib/python3.10/site-packages + key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.cfg') }} + base: "/home/docker/.cache/" + - name: Install Dependencies + run: source env/bin/activate && pip install -e '.[hf,hf-accel,tests,llama]' && pip install langchain + - name: Run Optional Tests + env: + OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} + run: source env/bin/activate && python src/lmql/tests/all.py only optional openai langchain --failearly + publish: + runs-on: lmql-ci + needs: [tests, optional-tests] + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + - name: Install Packaging Dependencies + run: pip install build twine + - name: Package + env: + VERSION: ${{ github.event.inputs.version }} + run: bash scripts/wheel.sh $(echo $VERSION | sed 's/^refs\/tags\/v//') + - name: Publish + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }} + VERSION: ${{ github.event.inputs.version }} + run: bash scripts/pypi-release.sh lmql-$(echo $VERSION | sed 's/^refs\/tags\/v//') \ No newline at end of file diff --git a/.github/workflows/lmql-tests.yml b/.github/workflows/lmql-tests.yml index fd6bd3c0..057e93f8 100644 --- a/.github/workflows/lmql-tests.yml +++ b/.github/workflows/lmql-tests.yml @@ -1,9 +1,11 @@ name: Run Tests -on: workflow_dispatch +on: + push: + branches: [ main ] jobs: - test-without-hf-transformers: + tests: runs-on: lmql-ci steps: - uses: actions/checkout@v3 @@ -14,27 +16,87 @@ jobs: export PATH=$PATH:/home/docker/.local/bin source env/bin/activate echo "VIRTUAL ENV:" $VIRTUAL_ENV + # cache dependencies + - name: Cache dependencies + id: cache + uses: corca-ai/local-cache@v2 + with: + path: env + key: ${{ runner.os }}-env-${{ hashFiles('**/setup.cfg') }} + base: "/home/docker/.cache/" - name: Install 
Dependencies - run: source env/bin/activate && pip install -e . && pip install langchain - - name: Greet + run: source env/bin/activate && pip install -e '.[hf,hf-accel,tests,llama]' + - name: Run Default Tests + run: source env/bin/activate && python src/lmql/tests/all.py --failearly + wheel: + runs-on: lmql-ci + needs: [tests] + steps: + - uses: actions/checkout@v3 + - name: Setup Fresh Virtual Environment + run: | + pip install --upgrade pip + python3.10 -m venv env + export PATH=$PATH:/home/docker/.local/bin + source env/bin/activate + echo "VIRTUAL ENV:" $VIRTUAL_ENV + - name: Install Packaging Dependencies + run: pip install build twine + - name: Package env: - OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} - run: source env/bin/activate && python -m lmql.cli hello openai - test-with-hf-transformers: - runs-on: lmql-ci - needs: [test-without-hf-transformers] - steps: - - uses: actions/checkout@v3 - - name: Setup Fresh Virtual Environment - run: | - pip install --upgrade pip - python3.10 -m venv env - export PATH=$PATH:/home/docker/.local/bin - source env/bin/activate - echo "VIRTUAL ENV:" $VIRTUAL_ENV - - name: Install Dependencies - run: source env/bin/activate && pip install -e '.[hf,tests]' && pip install langchain - - name: Run Tests - env: - OPENAI_API_KEY: ${{ secrets.LMQL_CI_OPENAI_KEY }} - run: source env/bin/activate && python src/lmql/tests/all.py --failearly \ No newline at end of file + VERSION: 0.999999 + run: bash scripts/wheel.sh $VERSION + - name: Upload wheel as artifact + uses: actions/upload-artifact@v2 + with: + name: lmql + path: dist/ + + web-build: + runs-on: ubuntu-latest + needs: [tests] + + steps: + - uses: actions/checkout@v3 + - name: Prepare Node.js environment + uses: actions/setup-node@v3 + with: + node-version: 20.x + cache: 'npm' + # cached dependencies for node + - name: Cache node modules + uses: actions/cache@v3 + with: + path: | + docs/node_modules + scripts/browser-build/node_modules + src/lmql/ui/live/node_modules + src/lmql/ui/playground/node_modules + key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} + + - name: Build website and In-Browser LMQL Distribution + run: cd scripts && bash deploy-web.sh + - uses: actions/upload-artifact@master + with: + name: lmql-web-payload + path: web-deploy + + web-deploy: + name: Deploy Main Playground + runs-on: ubuntu-latest + needs: [web-build] + if: + contains(' + refs/heads/main + ', github.ref) + steps: + - uses: actions/checkout@master + - uses: actions/download-artifact@master + with: + name: lmql-web-payload + path: web-deploy + - name: Deploy to web branch + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: web-deploy + branch: web \ No newline at end of file diff --git a/.github/workflows/lmql-web.yml b/.github/workflows/lmql-web.yml deleted file mode 100644 index 9d4f6bf8..00000000 --- a/.github/workflows/lmql-web.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Browser Build & Web Deploy - -on: - push: - branches: [ "main" ] - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Prepare Node.js environment - uses: actions/setup-node@v3 - with: - node-version: 20.x - cache: 'npm' - # cached dependencies for node - - name: Cache node modules - uses: actions/cache@v3 - with: - path: | - docs/node_modules - scripts/browser-build/node_modules - src/lmql/ui/live/node_modules - src/lmql/ui/playground/node_modules - key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} - - - name: Build website and In-Browser LMQL 
Distribution - run: cd scripts && bash deploy-web.sh - - uses: actions/upload-artifact@master - with: - name: lmql-web-payload - path: web-deploy - - deploy-main: - name: Deploy Main Playground - runs-on: ubuntu-latest - needs: [build] - if: - contains(' - refs/heads/main - ', github.ref) - steps: - - uses: actions/checkout@master - - uses: actions/download-artifact@master - with: - name: lmql-web-payload - path: web-deploy - - name: Deploy to web branch - uses: JamesIves/github-pages-deploy-action@v4 - with: - folder: web-deploy - branch: web - diff --git a/Dockerfile.tests b/Dockerfile.tests new file mode 100644 index 00000000..9bb05305 --- /dev/null +++ b/Dockerfile.tests @@ -0,0 +1,27 @@ +FROM python:3.11-bullseye + +# install lmql with llama.cpp dependencies +WORKDIR /lmql + +# download test model weights +RUN wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true -O /lmql/llama-2-7b-chat.Q2_K.gguf + +# install the torch cpu version +RUN pip install torch --index-url https://download.pytorch.org/whl/cpu + +COPY setup.cfg /lmql/setup.cfg +COPY setup.py /lmql/setup.py +RUN mkdir /lmql/src + +RUN pip install -e ".[llama,hf,hf-accel,tests]" +RUN pip install -e ".[hf]" +RUN pip install -e ".[hf,hf-accel,tests]" +RUN pip install langchain + +COPY src /lmql/src + +# python install sshleifer/tiny-gpt2 via transformers +RUN python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; AutoTokenizer.from_pretrained('sshleifer/tiny-gpt2'); AutoModelForCausalLM.from_pretrained('sshleifer/tiny-gpt2'); AutoTokenizer.from_pretrained('gpt2'); AutoModelForCausalLM.from_pretrained('gpt2')" + +CMD ["python", "src/lmql/tests/all.py"] +CMD "bash" \ No newline at end of file diff --git a/src/lmql/algorithms/cache.py b/src/lmql/algorithms/cache.py index f53d9cfa..4e6a8462 100644 --- a/src/lmql/algorithms/cache.py +++ b/src/lmql/algorithms/cache.py @@ -82,7 +82,6 @@ async def apply(q, *args, **kwargs): stats["cached"] += 1 return cache[key] else: - kwargs = {} try: result = await q(*args, **kwargs) if len(result) == 1: diff --git a/src/lmql/cli.py b/src/lmql/cli.py index b71c3c45..7d9c6e5d 100755 --- a/src/lmql/cli.py +++ b/src/lmql/cli.py @@ -227,7 +227,7 @@ def hello(): asyncio.run(lmql.run(code_openai, output_writer=lmql.printing, model="openai/text-ada-001")) def basic_samples(): - from lmql.tests.test_sample_queries import main + from lmql.tests.optional.openai.test_sample_queries import main import asyncio asyncio.run(main()) diff --git a/src/lmql/models/lmtp/lmtp_dcmodel.py b/src/lmql/models/lmtp/lmtp_dcmodel.py index 08441440..7e6b409b 100644 --- a/src/lmql/models/lmtp/lmtp_dcmodel.py +++ b/src/lmql/models/lmtp/lmtp_dcmodel.py @@ -224,7 +224,7 @@ def make_logits(self, payload): async def singleton_result(self, token, score): yield {"token": token, "logprob": score, "top_logprobs": {token: score}} - async def generate(self, s, temperature, top_logprobs = 1, chunk_size=None, **kwargs): + async def generate(self, s, temperature, sampling_mode, top_logprobs = 1, chunk_size=None, **kwargs): kwargs = {**self.model_args, **kwargs} # get token masks from interpreter @@ -245,7 +245,7 @@ async def generate(self, s, temperature, top_logprobs = 1, chunk_size=None, **kw num_allowed = masks.mask_num_allowed(mask) if num_allowed == 1: only_allowed_id = masks.mask_get_only_allowed(mask) - return self.singleton_result(only_allowed_id, 0.0) + return self.stream_and_return_first(s, self.singleton_result(only_allowed_id, 0.0), sampling_mode) assert 
nputil.is_array(mask), "logit_mask_or_fixed_id must be a LongTensor not a " + str(type(mask)) invert = num_allowed < self.tokenizer.vocab_size - num_allowed @@ -270,7 +270,7 @@ async def generate(self, s, temperature, top_logprobs = 1, chunk_size=None, **kw if self.verbose: text = await self.detokenize(ids) - print("lmtp generate: {} / {} ({} tokens, temperature={}, max_tokens={})".format(ids, str([text])[1:-1], len(ids), temperature, max_tokens)) + print("lmtp generate: {} / {} ({} tokens, temperature={}, max_tokens={})".format(ids, str([text])[1:-1], len(ids), temperature, max_tokens), flush=True) # get token stream token_stream = self.client.generate(ids, max_tokens=max_tokens, temperature=temperature, logit_bias=mask, top_logprobs=top_logprobs, **self.extra_decoding_parameters) @@ -289,9 +289,9 @@ async def generate(self, s, temperature, top_logprobs = 1, chunk_size=None, **kw } }) - return self.traced_generate(token_stream, event=stream_event) + token_stream = self.traced_generate(token_stream, event=stream_event) - return token_stream + return self.stream_and_return_first(s, token_stream, sampling_mode) async def traced_generate(self, generate_iterator, event: Event): first = True @@ -337,7 +337,7 @@ async def op_sample(seqs): # no sample-id needed for (deterministic) top-1 unique_sampling_mode = [sampling_mode for _ in seqs] - tokens = await asyncio.gather(*[self.stream_and_return_first(s, await self.generate(s, temperature=temperature, **kwargs), mode) for s,mode in zip(seqs, unique_sampling_mode)]) + tokens = await asyncio.gather(*[await self.generate(s, sampling_mode=mode, temperature=temperature, **kwargs) for s,mode in zip(seqs, unique_sampling_mode)]) next_token_ids = np.array([t['token'] for t in tokens], dtype=np.int64) next_token_scores = np.array([t['logprob'] for t in tokens], dtype=np.float32) @@ -365,7 +365,7 @@ async def op_topk(seqs, k): return [s.make_successors(next_token_ids[i].reshape(1), next_token_scores[i], logits=None) for i,s in enumerate(seqs)] self.model.num_queries += len(seqs) - result = await asyncio.gather(*[self.stream_and_return_first(s, await self.generate(s, temperature=0.0, top_logprobs=k, chunk_size=None if k == 1 else 1, **kwargs), "top-1") for s in seqs]) + result = await asyncio.gather(*[await self.generate(s, sampling_mode="top-1", temperature=0.0, top_logprobs=k, chunk_size=None if k == 1 else 1, **kwargs) for s in seqs]) logits = [] next_token_ids = [] diff --git a/src/lmql/runtime/dclib/dclib_cache.py b/src/lmql/runtime/dclib/dclib_cache.py index 062d32b0..544f2064 100644 --- a/src/lmql/runtime/dclib/dclib_cache.py +++ b/src/lmql/runtime/dclib/dclib_cache.py @@ -173,6 +173,16 @@ async def get_keys(self, s: DecoderSequence, edge_type: str, **kwargs): if type(s) is DeterministicDecoderSequence and len(s.next_ids) > 0: keys.append((self.base_key(s), str(s.next_ids[0]))) + def unpack(token): + if type(token) is np.ndarray: + token = token[0] + if type(token) is np.bytes_: + token = token.decode("utf-8") + try: + return int(token) + except: + return None + if mask is not None: if masks.mask_num_allowed(mask) == 1: keys.append((self.base_key(s), str(masks.mask_get_only_allowed(mask)))) @@ -182,18 +192,26 @@ async def get_keys(self, s: DecoderSequence, edge_type: str, **kwargs): if edge_type == "top-1": argmax_token, argmax_score = self.cache.get((self.base_key(s), "top-1"), (None, None)) - if type(argmax_token) is int and argmax_token in mask: - keys.append((self.base_key(s), str(argmax_token))) + argmax_token = unpack(argmax_token) + 
print("check if", [argmax_token, masks.mask_is_allowed(mask, argmax_token)], flush=True) + if type(argmax_token) is int and masks.mask_is_allowed(mask, argmax_token): + keys.append((self.base_key(s), "top-1")) else: + if edge_type == "top-1": + argmax_token, argmax_score = self.cache.get((self.base_key(s), "top-1"), (None, None)) + unpacked_argmax_token = unpack(argmax_token) + if type(unpacked_argmax_token) is int and masks.mask_is_allowed(mask, unpacked_argmax_token): + keys.append((self.base_key(s), "top-1")) + keys.append((self.base_key(s), edge_type, "-".join([str(i) for i in np.where(mask >= 0)[0]]))) else: # standard key is sequence id + edge type keys.append((self.base_key(s), edge_type)) - return keys + return keys, mask is not None async def get_cache(self, s: DecoderSequence, edge_type: str, user_data=False, **kwargs): - keys = await self.get_keys(s, edge_type, **kwargs) + keys, has_mask = await self.get_keys(s, edge_type, **kwargs) for k in keys: token, score = None, None @@ -221,7 +239,7 @@ async def get_cache(self, s: DecoderSequence, edge_type: str, user_data=False, * def set_cache(self, key, c: Union[Continuation, tuple], user_data=None, verbose=False): for k in key: if verbose: - print(" cached", k) + print(" cached", (k[0][-20:], k[1]), c) # check if the existing entry is a future existing = self.cache.get(k, (None, None))[0] @@ -252,6 +270,12 @@ async def op_argmax(seqs): # apply operation for non-cached non_cached = [s for s, c in zip(seqs, cached_tokens) if c is None] + + # for i, keys in enumerate(cache_keys): + # if cached_tokens[i] is None: + # for key in keys: + # print(" X non-cached", (key[0][-30:], key[1:]), non_cached[i]) + # generator over new results non_cached_argmax = iter((await self.delegate.argmax(DataArray(non_cached), **kwargs)).items()) @@ -527,6 +551,7 @@ async def token_consumer(itr): keys = None sq = None waiting_token_keys = [] + has_mask = False async for (s, tokens, scores, edge_types, user_data) in itr(): async with self.cache_lock: @@ -546,7 +571,7 @@ async def token_consumer(itr): if ids is None: ids = s.input_ids - keys = await self.get_keys(s, edge_type, **self.model_args) + keys, has_mask = await self.get_keys(s, edge_type, **self.model_args) sq = s token_keys = [(self.base_key(ids), edge_type, *k[2:]) for k in keys] @@ -554,6 +579,10 @@ async def token_consumer(itr): # filter out keys with edge_type=None token_keys = [k for k in token_keys if k[1] is not None] + # do not allow top-1 entries if this is a masked distribution + if has_mask: + token_keys = [k for k in token_keys if k[1] != "top-1" or len(k[2:]) > 0] + # for tk in token_keys: # if tk in self.cache and type(self.cache[tk][0]) is not asyncio.Future: # print("token_consumer: token for {} from stream already in cache ({} streams): {}".format(tk, len(self.token_streams), self.cache[tk])) diff --git a/src/lmql/runtime/masks.py b/src/lmql/runtime/masks.py index 95df8e75..81243634 100644 --- a/src/lmql/runtime/masks.py +++ b/src/lmql/runtime/masks.py @@ -43,4 +43,13 @@ def is_fixed_int_mask(mask): def to_dense(mask, vocab_size): dense_mask = np.ones([vocab_size]) * -np.inf dense_mask[mask] = 0 - return dense_mask \ No newline at end of file + return dense_mask + +def mask_key(mask): + if mask_num_allowed(mask) == 1: + return str(mask_get_only_allowed(mask)) + else: + if is_fixed_int_mask(mask): + return "-".join([str(i) for i in mask]) + else: + return "-".join([str(i) for i in np.where(mask >= 0)[0]]) \ No newline at end of file diff --git a/src/lmql/tests/README.md 
b/src/lmql/tests/README.md index cf129754..397e5c6c 100644 --- a/src/lmql/tests/README.md +++ b/src/lmql/tests/README.md @@ -3,3 +3,31 @@ **Running Test Suites** The directory contains a number of test suites. To run all tests, execute `python src/lmql/tests/all.py`. Note that for some tests you need to configure an OpenAI API key according to the instructions in the documentation. We are working to remove the external dependency on the OpenAI API, but for now it is still required for some tests. **Adding Tests** You are also invited to add new tests in the form of a new `test_*.py` file in the `src/lmql/tests/` directory. For an example of how to write tests, please see, e.g., https://github.com/eth-sri/lmql/blob/main/src/lmql/tests/test_nested_queries.py. As demonstrated by this file, also try to implement your tests using lmql.model("random", seed=) to make sure your test code can be run without actually using an LLM or external API, and that it can be re-run deterministically. + +## Running Tests in Docker + +To run the tests in a Docker container, first build the `Dockerfile.tests` image at the project root: + +```bash +docker build -f Dockerfile.tests -t lmql-tests . +``` + +Then run the tests in the container: + +```bash +docker run -v ./src:/lmql/src lmql-tests python src/lmql/tests/all.py [langchain] [openai] +``` + +This command will test the current working directory, which is mounted to `/lmql/src` in the container. + +The `langchain` and `openai` arguments are optional and can be used to explicitly enable the corresponding integration tests with the OpenAI API and LangChain. By default, both are disabled. + +## Test Levels and Dependencies + +* All files in this directory starting with `test_` are considered *default level* test modules. They are run by default when executing `python src/lmql/tests/all.py` and should not have any OpenAI API or LangChain dependencies. For *default level* tests, small `transformers` models or `llama.cpp` models are used, which can be executed fully locally without the need for an API key. + + For proper functioning, some default level tests may require a quantized variant of *Llama-2 Chat 7b* at `/lmql/llama-2-7b-chat.Q2_K.gguf`, which can be downloaded from `https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf` and can run on a CPU-only machine. + + Run tests with the provided `Dockerfile.tests` file to automatically set up the required directory structure in a Docker container. + +* Files in subfolders of the `optional/` directory, like `optional/openai/test_*.py`, are considered *optional level* test modules. They are not executed by default, but can be run by executing e.g. `python src/lmql/tests/all.py openai`. They may have OpenAI API, LangChain or other optional dependencies.
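For illustration, a minimal *default level* test module might look like the following sketch (the file name `test_example.py` is a placeholder; the `lmql.model("random", seed=...)` backend and the `run_all_tests` helper mirror the pattern used by the existing test modules in this diff, and the exact completion produced by the random model is not asserted here):

```python
# src/lmql/tests/test_example.py -- hypothetical default level test module
import lmql
from lmql.tests.expr_test_utils import run_all_tests

# default level tests use the seeded "random" model, so they run fully locally,
# need no API key, and produce deterministic output across runs
@lmql.query(model=lmql.model("random", seed=123))
def test_example_constraint():
    '''lmql
    "Hello[RESPONSE]" where len(TOKENS(RESPONSE)) < 10
    assert len(RESPONSE) > 0, f"Expected a non-empty completion, got {[RESPONSE]}"
    '''

if __name__ == "__main__":
    run_all_tests(globals())
```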
\ No newline at end of file diff --git a/src/lmql/tests/all.py b/src/lmql/tests/all.py index 3c1e213e..3066a774 100644 --- a/src/lmql/tests/all.py +++ b/src/lmql/tests/all.py @@ -3,37 +3,69 @@ import subprocess import lmql -THIS_DIR = os.path.dirname(__file__) -files = sorted(os.listdir(THIS_DIR)) -TEST_TIMEOUT = float(os.environ.get("TEST_TIMEOUT", 3*60.0)) - -errors = 0 -files = [f for f in files if f.startswith("test_")] - -print(f"Testing LMQL distribution {lmql.__version__} at {lmql.__file__} with {len(files)} test suites.") - -for i,f in enumerate(files): - try: - print(">", f"[{i+1}/{len(files)}]", f) - - cmd = [sys.executable, os.path.join(THIS_DIR, f)] - timeout = TEST_TIMEOUT - result = subprocess.call(cmd, timeout=timeout) - - if result == 2: - raise KeyboardInterrupt - if result != 0: - errors += 1 - if "--failearly" in sys.argv: - break - except subprocess.TimeoutExpired: - print(">", f"[{i+1}/{len(files)}]", f, "timed out after", timeout, "seconds") - sys.exit(1) +def run_tests(directory): + files = sorted(os.listdir(directory)) + TEST_TIMEOUT = float(os.environ.get("TEST_TIMEOUT", 3*60.0)) - except KeyboardInterrupt: - sys.exit(1) + errors = 0 + files = [f for f in files if f.startswith("test_")] + + print(f"Testing LMQL distribution {lmql.__version__} at {lmql.__file__} with {len(files)} in {directory}", flush=True) + + for i,f in enumerate(files): + try: + print(">", f"[{i+1}/{len(files)}]", f, flush=True) + + cmd = [sys.executable, os.path.join(directory, f)] + timeout = TEST_TIMEOUT + result = subprocess.call(cmd, timeout=timeout) + + if result == 2: + raise KeyboardInterrupt + if result != 0: + errors += 1 + print(">", f"[{i+1}/{len(files)}]", f, "failed", flush=True) + if "--failearly" in sys.argv: + break + except subprocess.TimeoutExpired: + print(">", f"[{i+1}/{len(files)}]", f, "timed out after", timeout, "seconds") + return 1 + + except KeyboardInterrupt: + return 1 + + if errors != 0: + return 1 + else: + return -if errors != 0: - sys.exit(1) -else: - sys.exit(0) \ No newline at end of file +if __name__ == "__main__": + THIS_DIR = os.path.dirname(__file__) + + # if you want to run only some targets, pass them as 'only' arguments, e.g. 'python all.py only openai' + if "only" in sys.argv: + targets = [] + else: + targets = [THIS_DIR] + + # default is the explicit name for . 
+ if "default" in sys.argv: + targets.append(THIS_DIR) + + include_all_optional = "optional" in sys.argv + + optional_targets = os.listdir(os.path.join(THIS_DIR, "optional")) + optional_targets = [t for t in optional_targets if os.path.isdir(os.path.join(THIS_DIR, "optional", t)) and (t in sys.argv or include_all_optional)] + optional_targets = [os.path.join(THIS_DIR, "optional", t) for t in optional_targets] + + targets = sorted(set(targets + optional_targets)) + + exit_codes = [] + + for t in targets: + exit_codes += [run_tests(t)] + + if any(exit_codes): + sys.exit(1) + else: + sys.exit(0) \ No newline at end of file diff --git a/src/lmql/tests/test_lmtp_langchain_integration.py b/src/lmql/tests/optional/langchain/test_lmtp_langchain_integration.py similarity index 71% rename from src/lmql/tests/test_lmtp_langchain_integration.py rename to src/lmql/tests/optional/langchain/test_lmtp_langchain_integration.py index 1ec8c064..7dd1783d 100644 --- a/src/lmql/tests/test_lmtp_langchain_integration.py +++ b/src/lmql/tests/optional/langchain/test_lmtp_langchain_integration.py @@ -35,7 +35,7 @@ def stop_lmql_serve_model(run_lmql_serve_model): @pytest.mark.asyncio async def test_do_async(test_llm): r = await test_llm.apredict(*ARGS, **KWARGS) - assert r == "available Spoon announces 1929edyame fertilizer Dipmmseasea", "apredict result is not as expected" + assert r == "available aliases huge millennia announcementbid continents Epstein retention Buddhism", "apredict result is not as expected" @pytest.mark.asyncio @@ -46,18 +46,14 @@ async def test_do_sync_in_async(test_llm): def test_do_sync(test_llm): r = test_llm.predict(*ARGS, **KWARGS) - assert r == "available Spoon announces 1929edyame fertilizer Dipmmseasea", "predict result is not as expected" + assert r == "available aliases huge millennia announcementbid continents Epstein retention Buddhism", "predict result is not as expected" def test_do_repeated_sync(test_llm): r = test_llm.predict(*ARGS, **KWARGS) - assert r == "available Spoon announces 1929edyame fertilizer Dipmmseasea", "Call 1: predict result is not as expected" + assert r == "available aliases huge millennia announcementbid continents Epstein retention Buddhism", "Call 1: predict result is not as expected" r = test_llm.predict(*ARGS, **KWARGS) - assert r == "available Spoon announces 1929edyame fertilizer Dipmmseasea", "Call 2: predict result is not as expected" + assert r == "available aliases huge millennia announcementbid continents Epstein retention Buddhism", "Call 2: predict result is not as expected" if __name__ == "__main__": - if not "RUN_LC_TESTS" in os.environ: - print("Skipping LMTP LangChain integration tests because RUN_LC_TESTS is not set") - sys.exit(0) - # only run this file with pytest sys.exit(pytest.main(args=["-s", __file__])) \ No newline at end of file diff --git a/src/lmql/tests/optional/langchain/test_query_args.py b/src/lmql/tests/optional/langchain/test_query_args.py new file mode 100644 index 00000000..d5550522 --- /dev/null +++ b/src/lmql/tests/optional/langchain/test_query_args.py @@ -0,0 +1,62 @@ +import lmql +from lmql.tests.expr_test_utils import run_all_tests + +# multi kw default +@lmql.query +async def multi_kw_chain(s: str = 'default', a: int = 12): + '''lmql + argmax + return {"result": (s, a)} + from + "chatgpt" + ''' + +# multi kw default +@lmql.query +async def no_return_chain(s: str = 'default', a: int = 12): + '''lmql + argmax + "This is [R1] and [R2]" + from + "chatgpt" + where + R1 == s and R2 == "8" + ''' + +def 
test_decorated_chain(): + c = multi_kw_chain.aschain(output_keys=["result"]) + + input_value = "Hi there" + a_value = 8 + + # as chain + s,a = c({"s": input_value, "a": a_value})["result"] + assert s == input_value, f"Expected {input_value}, got {s}" + assert a == a_value, f"Expected {a_value}, got {a}" + + c = no_return_chain.aschain() + res = c({"s": input_value, "a": a_value}) + s = res["R1"] + a = res["R2"] + assert s == input_value, f"Expected {input_value}, got {s}" + assert a == str(a_value), f"Expected {a_value}, got {a}" + +multipos_chain = lmql.query('''lmql +argmax + return {"result": (s, a)} +from + "chatgpt" +''' +, input_variables=['s', 'a']).aschain(output_keys=['result']) + +def test_query_args_with_str_aschain(): + input_value = "Hi there" + a_value = 8 + + # as chain + s,a = multipos_chain({"s": input_value, "a": a_value})["result"] + assert s == input_value, f"Expected {input_value}, got {s}" + assert a == a_value, f"Expected {a_value}, got {a}" + +if __name__ == "__main__": + run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_azure_backend.py b/src/lmql/tests/optional/openai/test_azure_backend.py similarity index 100% rename from src/lmql/tests/test_azure_backend.py rename to src/lmql/tests/optional/openai/test_azure_backend.py diff --git a/src/lmql/tests/optional/openai/test_multi_tokenizer.py b/src/lmql/tests/optional/openai/test_multi_tokenizer.py new file mode 100644 index 00000000..98af85c5 --- /dev/null +++ b/src/lmql/tests/optional/openai/test_multi_tokenizer.py @@ -0,0 +1,28 @@ +""" +Tests mixing different tokenizers/models in the same LMQL process. +""" + +import lmql +from lmql.tests.expr_test_utils import run_all_tests + +@lmql.query(model="chatgpt") +async def cg(): + '''lmql + "Hello[WORLD]" where len(TOKENS(WORLD)) < 3 + return WORLD + ''' + +@lmql.query(model="openai/gpt-3.5-turbo-instruct") +async def test_gpt35(): + '''lmql + "Hello[WORLD]" where len(TOKENS(WORLD)) == 4 + r = [WORLD, cg()] + assert r == [", I am a", " Hello!"], "Expected {}, got {}".format( + [", I am a", " Hello!"], + r + ) + return WORLD + ''' + +if __name__ == "__main__": + run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_multibyte_characters.py b/src/lmql/tests/optional/openai/test_multibyte_characters.py similarity index 100% rename from src/lmql/tests/test_multibyte_characters.py rename to src/lmql/tests/optional/openai/test_multibyte_characters.py diff --git a/src/lmql/tests/optional/openai/test_noprompt.py b/src/lmql/tests/optional/openai/test_noprompt.py new file mode 100644 index 00000000..30cd14f4 --- /dev/null +++ b/src/lmql/tests/optional/openai/test_noprompt.py @@ -0,0 +1,14 @@ +import lmql + +from lmql.tests.expr_test_utils import run_all_tests + +@lmql.query(model="openai/text-ada-001") +def test_noprompt_openai(): + '''lmql + "[RESPONSE]" where len(TOKENS(RESPONSE)) < 10 + expected = "\n\nThe first step in any software development" + assert RESPONSE == "\n\nThe first step in any software development", f"Expected '{expected}' got {[RESPONSE]}" + ''' + +if __name__ == "__main__": + run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/optional/openai/test_openai_api.py b/src/lmql/tests/optional/openai/test_openai_api.py new file mode 100644 index 00000000..32d5daa6 --- /dev/null +++ b/src/lmql/tests/optional/openai/test_openai_api.py @@ -0,0 +1,17 @@ +import lmql +import numpy as np + +from lmql.tests.expr_test_utils import run_all_tests + +def test_llm_openai(): + try: + 
import lmql.runtime.openai_secret + except: + print("Skipping test_api.test_llm_openai because no OpenAI API configuration could be found.") + return + + m = lmql.model("openai/text-davinci-003", silent=True) + assert m.score_sync("Hello", ["World", "Test"]).argmax() == "World" + +if __name__ == "__main__": + run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_openai_backend.py b/src/lmql/tests/optional/openai/test_openai_backend.py similarity index 100% rename from src/lmql/tests/test_openai_backend.py rename to src/lmql/tests/optional/openai/test_openai_backend.py diff --git a/src/lmql/tests/test_sample_queries.py b/src/lmql/tests/optional/openai/test_sample_queries.py similarity index 96% rename from src/lmql/tests/test_sample_queries.py rename to src/lmql/tests/optional/openai/test_sample_queries.py index 87761b2a..92fd8c52 100644 --- a/src/lmql/tests/test_sample_queries.py +++ b/src/lmql/tests/optional/openai/test_sample_queries.py @@ -18,7 +18,7 @@ def load_queries(): cwd = os.path.dirname(os.path.realpath(__file__)) # js file to require file and console.log .queries contents = f""" - require = require("{os.path.join(cwd, "..", "ui", "playground", "src", "queries.js")}") + require = require("{os.path.join(cwd, "..", "..", "..", "ui", "playground", "src", "queries.js")}") console.log(JSON.stringify(require.queries)) """ # write contents to temp file diff --git a/src/lmql/tests/queryargs/test_args.py b/src/lmql/tests/queryargs/test_args.py index 2f8ff11e..72e735b7 100644 --- a/src/lmql/tests/queryargs/test_args.py +++ b/src/lmql/tests/queryargs/test_args.py @@ -167,23 +167,4 @@ async def no_return_chain(s: str = 'default', a: int = 12): "chatgpt" where R1 == s and R2 == "8" - ''' - - -def test_decorated_chain(): - c = multi_kw_chain.aschain(output_keys=["result"]) - - input_value = "Hi there" - a_value = 8 - - # as chain - s,a = c({"s": input_value, "a": a_value})["result"] - assert s == input_value, f"Expected {input_value}, got {s}" - assert a == a_value, f"Expected {a_value}, got {a}" - - c = no_return_chain.aschain() - res = c({"s": input_value, "a": a_value}) - s = res["R1"] - a = res["R2"] - assert s == input_value, f"Expected {input_value}, got {s}" - assert a == str(a_value), f"Expected {a_value}, got {a}" \ No newline at end of file + ''' \ No newline at end of file diff --git a/src/lmql/tests/queryargs/test_args_query_str.py b/src/lmql/tests/queryargs/test_args_query_str.py index f4aea557..875e2a56 100644 --- a/src/lmql/tests/queryargs/test_args_query_str.py +++ b/src/lmql/tests/queryargs/test_args_query_str.py @@ -27,14 +27,6 @@ ''' , input_variables=['s', 'a']) -multipos_chain = lmql.query('''lmql -argmax - return {"result": (s, a)} -from - "chatgpt" -''' -, input_variables=['s', 'a']).aschain(output_keys=['result']) - async def test_query_args_with_str(): input_value = "Hi there" a_value = 8 @@ -70,13 +62,4 @@ async def test_query_args_with_str(): # specify fully as kw s, a = (await multipos(s=input_value, a=a_value)) assert s == input_value, f"Expected {input_value}, got {s}" - assert a == a_value, f"Expected {a_value}, got {a}" - -def test_query_args_with_str_aschain(): - input_value = "Hi there" - a_value = 8 - - # as chain - s,a = multipos_chain({"s": input_value, "a": a_value})["result"] - assert s == input_value, f"Expected {input_value}, got {s}" assert a == a_value, f"Expected {a_value}, got {a}" \ No newline at end of file diff --git a/src/lmql/tests/test_api.py b/src/lmql/tests/test_api.py index 85f18574..f3ec6381 100644 --- 
a/src/lmql/tests/test_api.py +++ b/src/lmql/tests/test_api.py @@ -37,9 +37,10 @@ async def test_llm_multi_generate(): assert type(r) is str and len(r) > 0 async def test_llm_generate_two_sequential(): - llm = lmql.model("openai/text-ada-001", seed=123) + llm = lmql.model("random", seed=123) result1 = await llm.generate("Test", max_tokens=10) result2 = await llm.generate("Test", max_tokens=10) + assert type(result1) is str and len(result1) > 0 assert type(result2) is str and len(result2) > 0 @@ -71,15 +72,5 @@ def test_llm_local(): m = lmql.model("local:sshleifer/tiny-gpt2", silent=True) assert m.score_sync("Hello", ["World", "Test"]).argmax() == "Test" -def test_llm_openai(): - try: - import lmql.runtime.openai_secret - except: - print("Skipping test_api.test_llm_openai because no OpenAI API configuration could be found.") - return - - m = lmql.model("openai/text-davinci-003", silent=True) - assert m.score_sync("Hello", ["World", "Test"]).argmax() == "World" - if __name__ == "__main__": run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_back2back_caching.py b/src/lmql/tests/test_back2back_caching.py index 92f7eec8..f349a314 100644 --- a/src/lmql/tests/test_back2back_caching.py +++ b/src/lmql/tests/test_back2back_caching.py @@ -14,13 +14,18 @@ async def q(): "3. Thought This is good\n" "[NUM][MODE][CONTENT]" from - "openai/text-davinci-003" + # lmql.model("random", seed=123) + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") where - MODE in [" Action", " Thought"] and STOPS_AT(CONTENT, "\n") and STOPS_AT(NUM, ".") + MODE in [" Action", " Thought"] and STOPS_AT(CONTENT, "\n") and STOPS_AT(NUM, ".") and len(TOKENS(NUM)) < 4 and len(TOKENS(CONTENT)) < 16 ''' -result = lmql.main(q)[0] -assert len(result.variables["CONTENT"]) > 5, f"Expected CONTENT to be longer than 5 characters, got {result.variables['CONTENT']}" -stats = get_stats() -requests = str(stats).split(",")[0].split(":")[1].strip() -assert requests == "2 requests", f"Expected query to need 2 requests, got {requests}" \ No newline at end of file +with lmql.traced("back2back") as t: + result: lmql.LMQLResult = lmql.main(q) + assert len(result.variables["CONTENT"]) > 5, f"Expected CONTENT to be longer than 5 characters, got {str([result.variables['CONTENT']])}" + + cert = lmql.certificate(t) + events = cert.asdict()["children"][0]["events"] + + generate_calls = [e for e in events if e['name'] == 'lmtp.generate'] + assert len(generate_calls) == 2, f"Expected 2 generate calls, got {len(generate_calls)}" \ No newline at end of file diff --git a/src/lmql/tests/test_escaping.py b/src/lmql/tests/test_escaping.py index 132d5c6b..05edbd0a 100644 --- a/src/lmql/tests/test_escaping.py +++ b/src/lmql/tests/test_escaping.py @@ -8,9 +8,9 @@ def test_curly_braces(): argmax value = "[abc]" "{{ Say {value} 'this is a test':[RESPONSE] }}" - assert context.prompt == "{ Say [abc] 'this is a test':\n\nThis is a test. 
}" + assert context.prompt == "{ Say [abc] 'this is a test': inspires dental maneuver attracted calculatesMonlearning triangles hiber }", f"Got {context.prompt}" from - "openai/text-ada-001" + lmql.model("random", seed=123) where len(TOKENS(RESPONSE)) < 10 ''' @@ -22,7 +22,7 @@ def test_curly_only(): "{{ Say }}" assert context.prompt == "{ Say }" from - "openai/text-ada-001" + lmql.model("random", seed=123) ''' @@ -33,7 +33,7 @@ def test_square_only(): "[[Say]]" assert context.prompt == "[Say]" from - "openai/text-ada-001" + lmql.model("random", seed=123) ''' @lmql.query @@ -43,7 +43,7 @@ def test_square_with_var_only(): "[[[Say]]]" assert context.prompt == "[Hello]" from - "openai/text-ada-001" + lmql.model("random", seed=123) where Say == "Hello" ''' @@ -55,7 +55,7 @@ def test_square_in_constraint(): person = "test" "Hello {person}, my name is [NAME]. Nice to meet you!" from - "openai/text-ada-001" + lmql.model("random", seed=123) where NAME in ["["] ''' @@ -78,9 +78,9 @@ def test_json_decoding(): import json json.loads(context.prompt.split(":",1)[1]) from - "openai/text-davinci-003" + lmql.model("random", seed=123) where - STOPS_BEFORE(STRING_VALUE, '"') and INT(INT_VALUE) and len(TOKENS(INT_VALUE)) < 2 + STOPS_BEFORE(STRING_VALUE, '"') and INT(INT_VALUE) and len(TOKENS(INT_VALUE)) < 2 and len(TOKENS(STRING_VALUE)) < 10 ''' run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_multi_tokenizer.py b/src/lmql/tests/test_multi_tokenizer.py index c455f68c..e5ce60bb 100644 --- a/src/lmql/tests/test_multi_tokenizer.py +++ b/src/lmql/tests/test_multi_tokenizer.py @@ -43,24 +43,5 @@ async def test_llama_from_gpt(): ) ''' -@lmql.query(model="chatgpt") -async def cg(): - '''lmql - "Hello[WORLD]" where len(TOKENS(WORLD)) < 3 - return WORLD - ''' - -@lmql.query(model="openai/gpt-3.5-turbo-instruct") -async def test_gpt35(): - '''lmql - "Hello[WORLD]" where len(TOKENS(WORLD)) == 4 - r = [WORLD, cg()] - assert r == [", I am a", " Hello!"], "Expected {}, got {}".format( - [", I am a", " Hello!"], - r - ) - return WORLD - ''' - if __name__ == "__main__": run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_noprompt.py b/src/lmql/tests/test_noprompt.py index 8a1d72c6..e688fc3b 100644 --- a/src/lmql/tests/test_noprompt.py +++ b/src/lmql/tests/test_noprompt.py @@ -26,14 +26,6 @@ async def test_noprompt_beam(): rs = await noprompt_beam() assert [r.variables['RESPONSE'] for r in rs] == [' Protocol', ' Protocol radi'], f"Expected fixed random value but got {[r.variables['RESPONSE'] for r in rs]}" -@lmql.query(model="openai/text-ada-001") -def test_noprompt_openai(): - '''lmql - "[RESPONSE]" where len(TOKENS(RESPONSE)) < 10 - expected = "\n\nThe first step in any software development" - assert RESPONSE == "\n\nThe first step in any software development", f"Expected '{expected}' got {[RESPONSE]}" - ''' - @lmql.query(model=lmql.model("random", seed=123)) def test_noprompt_with_constraints(): '''lmql diff --git a/src/lmql/tests/test_orop.py b/src/lmql/tests/test_orop.py index 7a1a5b9d..f336d203 100644 --- a/src/lmql/tests/test_orop.py +++ b/src/lmql/tests/test_orop.py @@ -8,7 +8,7 @@ async def q(): sample(temperature=0.8, chunksize=30, max_len=128) "The movie review in positive sentiment is: '[OUTPUT]" FROM - "openai/text-ada-001" + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") WHERE len(TOKENS(OUTPUT)) < 20 or len(TOKENS(OUTPUT)) < 30 ''' diff --git 
a/src/lmql/tests/test_query_args.py b/src/lmql/tests/test_query_args.py index 62475951..a731081b 100644 --- a/src/lmql/tests/test_query_args.py +++ b/src/lmql/tests/test_query_args.py @@ -1,10 +1,10 @@ from lmql.tests.expr_test_utils import run_all_tests from lmql.tests.queryargs.test_var_errors import test_var_errors from lmql.tests.queryargs.test_args_run import test_lmql_run_args -from lmql.tests.queryargs.test_args_query_str import test_query_args_with_str, test_query_args_with_str_aschain +from lmql.tests.queryargs.test_args_query_str import test_query_args_with_str from lmql.tests.queryargs.test_sync import * -from lmql.tests.queryargs.test_args import test_query_args, test_decorated_chain +from lmql.tests.queryargs.test_args import test_query_args if __name__ == "__main__": run_all_tests(globals()) \ No newline at end of file diff --git a/src/lmql/tests/test_scoping.py b/src/lmql/tests/test_scoping.py index 31e6b7c4..378d175d 100644 --- a/src/lmql/tests/test_scoping.py +++ b/src/lmql/tests/test_scoping.py @@ -18,7 +18,7 @@ class A: pass assert "actual_input" in locals().keys(), f"Input variable actual_input not in local scope. locals: {locals()}" assert "actual_input" not in globals().keys(), f"Input variable actual_input not in global scope. locals: {locals()}" from - "openai/text-ada-001" + lmql.model("random", seed=123) ''' async def test_decode_clause_scoping(): @@ -27,7 +27,7 @@ async def test_decode_clause_scoping(): "Q: Hi {n}. A: [WHO]" assert "n" in locals().keys(), f"Input variable n not captured by scope" from - "openai/text-ada-001" + lmql.model("random", seed=123) where len(TOKENS(WHO)) < 10 ''' @@ -38,7 +38,7 @@ async def test_output_vars(): @lmql.query async def q(): '''lmql - argmax "Q: Hi {n}. A: [WHO]" from "openai/text-ada-001" where len(TOKENS(WHO)) < 10 + argmax "Q: Hi {n}. A: [WHO]" from lmql.model("random", seed=123) where len(TOKENS(WHO)) < 10 ''' assert q.output_variables == ['WHO'], f"Expected output variables to be ['WHO'], got {q.output_variables}" @@ -52,7 +52,7 @@ async def q(): "Q: Hi {f(1)}. 
A: [WHO]" assert context.prompt.startswith("Q: Hi 13") from - "openai/text-ada-001" + lmql.model("random", seed=123) where len(TOKENS(WHO)) < 10 ''' diff --git a/src/lmql/tests/test_stopping.py b/src/lmql/tests/test_stopping.py index 7b7766ea..3846036f 100644 --- a/src/lmql/tests/test_stopping.py +++ b/src/lmql/tests/test_stopping.py @@ -8,66 +8,63 @@ async def q(): sample(temperature=0.8, max_len=64) "The movie review in positive sentiment is: [OUTPUT]" FROM - "openai/text-ada-001" + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") WHERE STOPS_BEFORE(OUTPUT, "\n") and STOPS_BEFORE(OUTPUT, "n") and len(TOKENS(OUTPUT)) < 10 ''' (await q())[0] +@lmql.query async def test_stopping_double_match(): - @lmql.query - async def q(): - '''lmql - argmax - '{{"1","2","3",[COMPL]}}' - from - "openai/text-ada-001" - where - len(TOKENS(COMPL)) < 10 and STOPS_AT(COMPL, '"') - ''' - assert (await q())[0].variables["COMPL"] == '4"' + '''lmql + argmax + '{{"1","2","3","[COMPL]}}' + assert COMPL == '4"', "Expected COMPL to be '4\"', but was " + str([COMPL]) + from + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") + where + len(TOKENS(COMPL)) < 10 and STOPS_AT(COMPL, '"') + ''' +@lmql.query async def test_stopping_double_match_before(): - @lmql.query - async def q(): - '''lmql - argmax - '{{"1","2","3",[COMPL]"}}' - from - "openai/text-ada-001" - where - len(TOKENS(COMPL)) < 10 and STOPS_BEFORE(COMPL, '"') - ''' - assert (await q())[0].variables["COMPL"] == '4' + '''lmql + argmax + '{{"1","2","3","[COMPL]"}}' + assert COMPL == '4', "Expected COMPL to be '4', but was " + str([COMPL]) + from + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") + where + len(TOKENS(COMPL)) < 10 and STOPS_BEFORE(COMPL, '"') + ''' +@lmql.query async def test_stopping_single_match(): - @lmql.query - async def q(): - '''lmql - argmax - """ - My name is Peter. In JSON: - {{ - "name": "[COMPL] - }} - """ - from - "openai/text-ada-001" - where - STOPS_AT(COMPL, '"') - ''' - assert (await q())[0].variables["COMPL"] == 'Peter"' + '''lmql + argmax + """ + My name is Peter. 
In JSON: + {{ + "name": "[COMPL] + }} + """ + assert COMPL == 'Peter"', "Expected COMPL to be 'Peter\"', but was " + str([COMPL]) + from + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") + where + STOPS_AT(COMPL, '"') + ''' @lmql.query async def test_conditional_stopping(): '''lmql argmax "The movie review in positive sentiment is: [OUTPUT]" - assert OUTPUT.count("The") == 2 + assert OUTPUT.count("review") == 2 from - "openai/text-ada-001" + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") where - len(TOKENS(OUTPUT)) > 18 and STOPS_AT(OUTPUT, "The") + len(TOKENS(OUTPUT)) > 21 and STOPS_AT(OUTPUT, "review") ''' @lmql.query @@ -75,11 +72,11 @@ async def test_conditional_or_stopping(): '''lmql argmax "The movie review in positive sentiment is: [OUTPUT]" - assert OUTPUT.count("The") == 1 + assert OUTPUT.count("review") == 1 from - "openai/text-ada-001" + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") where - len(TOKENS(OUTPUT)) > 20 or STOPS_AT(OUTPUT, "The") + len(TOKENS(OUTPUT)) > 10 or STOPS_AT(OUTPUT, "review") ''' @lmql.query @@ -87,11 +84,11 @@ async def test_double_stop_rewrite(): '''lmql argmax "The movie review in positive sentiment is: [OUTPUT] Here" - assert OUTPUT.endswith("The") + assert OUTPUT.endswith("5/5.") from - "openai/text-ada-001" + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") where - STOPS_BEFORE(OUTPUT, " ") and STOPS_AT(OUTPUT, "re") + STOPS_BEFORE(OUTPUT, " ") and STOPS_AT(OUTPUT, "Unter") ''' @lmql.query @@ -101,7 +98,7 @@ async def test_double_stop_rewrite_space(): "The movie review in positive sentiment is: [OUTPUT] Here" assert OUTPUT.endswith(" ") from - "openai/text-ada-001" + lmql.model("local:llama.cpp:/lmql/llama-2-7b-chat.Q2_K.gguf", tokenizer="AyyYOO/Luna-AI-Llama2-Uncensored-FP16-sharded") where STOPS_AT(OUTPUT, " ") and STOPS_AT(OUTPUT, "re") '''