Implement vision-based AI chat capabilities using the z-ai-web-dev-sdk. Use this skill when the user needs to analyze images, describe visual content, or create applications that combine image understanding with conversational AI. Supports image URLs and base64 encoded images for multimodal interactions.
This skill guides the implementation of vision chat functionality using the z-ai-web-dev-sdk package, enabling AI models to understand and respond to images combined with text prompts.
Skill Location: {project_path}/skills/VLM
This skill is located at the above path in your project.
Reference Scripts: Example test scripts are available in the {Skill Location}/scripts/ directory for quick testing and reference. See {Skill Location}/scripts/vlm.ts for a working example.
Vision Chat allows you to build applications that can analyze images, extract information from visual content, and answer questions about images through natural language conversation.
IMPORTANT: z-ai-web-dev-sdk MUST be used in backend code only. Never use it in client-side code.
The z-ai-web-dev-sdk package is already installed. Import it as shown in the examples below.
For simple image analysis tasks, you can use the z-ai CLI instead of writing code. This is ideal for quick image descriptions, testing vision capabilities, or simple automation.
# Describe an image from URL
z-ai vision --prompt "What's in this image?" --image "https://example.com/photo.jpg"
# Using short options
z-ai vision -p "Describe this image" -i "https://example.com/image.png"

# Analyze a local image file
z-ai vision -p "What objects are in this photo?" -i "./photo.jpg"
# Save response to file
z-ai vision -p "Describe the scene" -i "./landscape.png" -o description.json

# Analyze multiple images at once
z-ai vision \
-p "Compare these two images" \
-i "./photo1.jpg" \
-i "./photo2.jpg" \
-o comparison.json
# Multiple images with detailed analysis
z-ai vision \
--prompt "What are the differences between these images?" \
--image "https://example.com/before.jpg" \
--image "https://example.com/after.jpg"

# Enable thinking for complex visual reasoning
z-ai vision \
-p "Count the number of people in this image and describe their activities" \
-i "./crowd.jpg" \
--thinking \
-o analysis.json

# Stream the vision analysis
z-ai vision -p "Describe this image in detail" -i "./photo.jpg" --stream

Options:
--prompt, -p <text>: Required - Question or instruction about the image(s)
--image, -i <URL or path>: Optional - Image URL or local file path (can be used multiple times)
--thinking, -t: Optional - Enable chain-of-thought reasoning (default: disabled)
--output, -o <path>: Optional - Output file path (JSON format)
--stream: Optional - Stream the response in real-time

Use CLI for:
Use SDK for:
For better performance and reliability, use base64 encoding to pass images to the model instead of image URLs.
The Vision Chat API supports three types of media content:
Use this type for static images (PNG, JPEG, GIF, WebP, etc.)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}

Use this type for video content (MP4, AVI, MOV, etc.)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'video_url', video_url: { url: videoUrl } }
]
}

Use this type for document files (PDF, DOCX, TXT, etc.)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'file_url', file_url: { url: fileUrl } }
]
}

Note: You can combine multiple content types in a single message. For example, you can include both text and multiple images, or text with both an image and a document.
import ZAI from 'z-ai-web-dev-sdk';
async function analyzeImage(imageUrl, question) {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: question
},
{
type: 'image_url',
image_url: {
url: imageUrl
}
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
// Usage
const result = await analyzeImage(
'https://example.com/product.jpg',
'Describe this product in detail'
);
console.log('Analysis:', result);import ZAI from 'z-ai-web-dev-sdk';
/**
 * Ask the vision model to reason over several images in one request.
 *
 * @param {string[]} imageUrls - URLs of the images to include, in order.
 * @param {string} question - Prompt describing the comparison to perform.
 * @returns {Promise<string|undefined>} The model's answer, if any.
 */
async function compareImages(imageUrls, question) {
  const zai = await ZAI.create();

  // Lead with the text prompt, then attach every image in order.
  const parts = [{ type: 'text', text: question }];
  for (const url of imageUrls) {
    parts.push({ type: 'image_url', image_url: { url } });
  }

  const response = await zai.chat.completions.createVision({
    messages: [{ role: 'user', content: parts }],
    thinking: { type: 'disabled' },
  });

  return response.choices[0]?.message?.content;
}

// Usage
const comparison = await compareImages(
  ['https://example.com/before.jpg', 'https://example.com/after.jpg'],
  'Compare these two images and describe the differences'
);

import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';

// Map of supported image file extensions to their MIME types.
const IMAGE_MIME_TYPES = {
  '.png': 'image/png',
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
};

/**
 * Resolve the MIME type for an image path from its file extension.
 * Case-insensitive (so ".PNG" works), and covers the formats the skill
 * documents as supported (PNG, JPEG, GIF, WebP). Unknown extensions fall
 * back to image/jpeg, matching the previous behavior for non-PNG files.
 *
 * @param {string} imagePath - Path whose extension identifies the format.
 * @returns {string} A MIME type such as "image/png".
 */
function mimeTypeFor(imagePath) {
  const dot = imagePath.lastIndexOf('.');
  const ext = dot === -1 ? '' : imagePath.slice(dot).toLowerCase();
  return IMAGE_MIME_TYPES[ext] ?? 'image/jpeg';
}

/**
 * Analyze an image stored on the local filesystem by sending it to the
 * vision model as a base64 data URL (recommended over remote URLs for
 * performance and reliability).
 *
 * @param {string} imagePath - Path to the local image file.
 * @param {string} question - Prompt describing what to analyze.
 * @returns {Promise<string|undefined>} The model's answer, if any.
 */
async function analyzeLocalImage(imagePath, question) {
  const zai = await ZAI.create();

  // Read image file and embed it as a base64 data URL.
  const base64Image = fs.readFileSync(imagePath).toString('base64');
  const dataUrl = `data:${mimeTypeFor(imagePath)};base64,${base64Image}`;

  const response = await zai.chat.completions.createVision({
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: question },
          { type: 'image_url', image_url: { url: dataUrl } },
        ],
      },
    ],
    thinking: { type: 'disabled' },
  });

  return response.choices[0]?.message?.content;
}

import ZAI from 'z-ai-web-dev-sdk';
/**
 * Multi-turn conversation that keeps image context across questions.
 *
 * Call initialize() once before use, then addImage() to start talking
 * about an image and followUp() for subsequent questions. The full
 * history is replayed on every request so the model retains context.
 */
class VisionChatSession {
  constructor() {
    // Accumulated conversation turns (user and assistant alike).
    this.messages = [];
  }

  async initialize() {
    this.zai = await ZAI.create();
  }

  /**
   * Add an image plus an opening question and get the model's reply.
   * @param {string} imageUrl - URL (or data URL) of the image.
   * @param {string} initialQuestion - First question about the image.
   * @returns {Promise<string|undefined>} Assistant reply.
   */
  async addImage(imageUrl, initialQuestion) {
    const turn = {
      role: 'user',
      content: [
        { type: 'text', text: initialQuestion },
        { type: 'image_url', image_url: { url: imageUrl } },
      ],
    };
    this.messages.push(turn);
    return this.getResponse();
  }

  /**
   * Ask a text-only follow-up; earlier images remain in context.
   * @param {string} question - Follow-up question.
   * @returns {Promise<string|undefined>} Assistant reply.
   */
  async followUp(question) {
    this.messages.push({
      role: 'user',
      content: [{ type: 'text', text: question }],
    });
    return this.getResponse();
  }

  /**
   * Send the accumulated history and record the assistant's reply
   * so later turns can build on it.
   */
  async getResponse() {
    const response = await this.zai.chat.completions.createVision({
      messages: this.messages,
      thinking: { type: 'disabled' },
    });
    const assistantMessage = response.choices[0]?.message?.content;
    this.messages.push({ role: 'assistant', content: assistantMessage });
    return assistantMessage;
  }
}
// Usage: one image, then a follow-up question in the same session.
const chatSession = new VisionChatSession();
await chatSession.initialize();

const firstAnswer = await chatSession.addImage(
  'https://example.com/chart.jpg',
  'What does this chart show?'
);
console.log('Initial analysis:', firstAnswer);

const nextAnswer = await chatSession.followUp('What are the key trends?');
console.log('Follow-up:', nextAnswer);

import ZAI from 'z-ai-web-dev-sdk';
/**
 * Classify an image and return structured metadata.
 *
 * Asks the model for a JSON answer; when the reply is not valid JSON,
 * the raw text is returned under `rawResponse` instead of throwing.
 *
 * @param {string} imageUrl - URL of the image to classify.
 * @returns {Promise<object>} Parsed JSON, or { rawResponse: string }.
 */
async function classifyImage(imageUrl) {
  const zai = await ZAI.create();

  // The template literal body is part of the prompt string, so it is
  // kept flush-left rather than indented with the surrounding code.
  const prompt = `Analyze this image and provide:
1. Main subject/category
2. Key objects detected
3. Scene description
4. Suggested tags (comma-separated)
Format your response as JSON.`;

  const response = await zai.chat.completions.createVision({
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          { type: 'image_url', image_url: { url: imageUrl } },
        ],
      },
    ],
    thinking: { type: 'disabled' },
  });

  const content = response.choices[0]?.message?.content;
  try {
    return JSON.parse(content);
  } catch (e) {
    // Model replied with non-JSON text; hand it back unparsed.
    return { rawResponse: content };
  }
}

import ZAI from 'z-ai-web-dev-sdk';
/**
 * OCR-style text extraction from an image.
 *
 * @param {string} imageUrl - URL (or data URL) of the image.
 * @returns {Promise<string|undefined>} Extracted text, layout preserved.
 */
async function extractText(imageUrl) {
  const zai = await ZAI.create();

  const ocrMessage = {
    role: 'user',
    content: [
      {
        type: 'text',
        text: 'Extract all text from this image. Preserve the layout and formatting as much as possible.'
      },
      { type: 'image_url', image_url: { url: imageUrl } },
    ],
  };

  const response = await zai.chat.completions.createVision({
    messages: [ocrMessage],
    thinking: { type: 'disabled' },
  });

  return response.choices[0]?.message?.content;
}

/**
 * Vision chat wrapper that never rejects: failures come back as a
 * { success: false, error } result instead of a thrown exception.
 *
 * @param {string} imageUrl - URL of the image to analyze.
 * @param {string} question - Prompt describing what to analyze.
 * @returns {Promise<{success: boolean, content?: string, error?: string}>}
 */
async function safeVisionChat(imageUrl, question) {
  try {
    const zai = await ZAI.create();
    const response = await zai.chat.completions.createVision({
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: question },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
      thinking: { type: 'disabled' },
    });
    return {
      success: true,
      content: response.choices[0]?.message?.content,
    };
  } catch (error) {
    console.error('Vision chat error:', error);
    return {
      success: false,
      error: error.message,
    };
  }
}

import express from 'express';
import ZAI from 'z-ai-web-dev-sdk';

const app = express();
app.use(express.json());

// Single shared SDK instance, created once at startup.
let zaiInstance;

// Initialize SDK once
async function initZAI() {
  zaiInstance = await ZAI.create();
}

/**
 * POST /api/analyze-image
 * Body: { imageUrl: string, question: string }
 * Responds with { success: true, analysis } on success, 400 when a
 * required field is missing, or 500 on an SDK/runtime failure.
 */
app.post('/api/analyze-image', async (req, res) => {
  try {
    const { imageUrl, question } = req.body;
    if (!imageUrl || !question) {
      return res.status(400).json({
        error: 'imageUrl and question are required'
      });
    }
    const response = await zaiInstance.chat.completions.createVision({
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: question },
            { type: 'image_url', image_url: { url: imageUrl } }
          ]
        }
      ],
      thinking: { type: 'disabled' }
    });
    res.json({
      success: true,
      analysis: response.choices[0]?.message?.content
    });
  } catch (error) {
    res.status(500).json({
      success: false,
      error: error.message
    });
  }
});

// Only start serving once the SDK is ready. The .catch handles init
// failure explicitly — previously this was a floating promise whose
// rejection would have gone unhandled.
initZAI()
  .then(() => {
    app.listen(3000, () => {
      console.log('Vision chat API running on port 3000');
    });
  })
  .catch((error) => {
    console.error('Failed to initialize ZAI SDK:', error);
    process.exit(1);
  });

Issue: "SDK must be used in backend"
Issue: Image not loading or being analyzed
Issue: Poor analysis quality
Issue: Slow response times
07048a9
If you maintain this skill, you can claim it as your own. Once claimed, you can manage eval scenarios, bundle related skills, attach documentation or rules, and ensure cross-agent compatibility.