From 11fe3932ded96133af7e1305c091766599988ac8 Mon Sep 17 00:00:00 2001 From: Abirdcfly Date: Mon, 20 Nov 2023 13:57:44 +0800 Subject: [PATCH] fix: knowledgebase should use versioneddataset Signed-off-by: Abirdcfly --- api/v1alpha1/knowledgebase_types.go | 5 +- ...dia.kubeagi.k8s.com.cn_knowledgebases.yaml | 5 +- config/rbac/role.yaml | 14 ++ config/samples/arcadia_v1alpha1_dataset.yaml | 8 +- .../arcadia_v1alpha1_knowledgebase.yaml | 9 +- .../arcadia_v1alpha1_versioneddataset.yaml | 17 ++- controllers/knowledgebase_controller.go | 142 ++++++++---------- ...dia.kubeagi.k8s.com.cn_knowledgebases.yaml | 5 +- deploy/charts/arcadia/templates/rbac.yaml | 14 ++ pkg/documentloaders/qa_csv.go | 95 ++++++++++++ pkg/documentloaders/qa_csv_test.go | 43 ++++++ pkg/documentloaders/testdata/qa.csv | 26 ++++ pkg/scheduler/executor.go | 14 +- tests/example-test.sh | 54 ++++--- tests/knowledgebase-1.txt | 11 -- 15 files changed, 333 insertions(+), 129 deletions(-) create mode 100644 pkg/documentloaders/qa_csv.go create mode 100644 pkg/documentloaders/qa_csv_test.go create mode 100644 pkg/documentloaders/testdata/qa.csv delete mode 100644 tests/knowledgebase-1.txt diff --git a/api/v1alpha1/knowledgebase_types.go b/api/v1alpha1/knowledgebase_types.go index aef8a5063..d164a45a2 100644 --- a/api/v1alpha1/knowledgebase_types.go +++ b/api/v1alpha1/knowledgebase_types.go @@ -30,11 +30,12 @@ type KnowledgeBaseSpec struct { // VectorStore defines the vectorstore to store results VectorStore *TypedObjectReference `json:"vectorStore,omitempty"` - // FileGroups included files Grouped by Datasource + // FileGroups included files Grouped by VersionedDataset FileGroups []FileGroup `json:"fileGroups,omitempty"` } + type FileGroupDetail struct { - // From defines the source which provides these files + // From defines the datasource which provides these files Source *TypedObjectReference `json:"source,omitempty"` // FileDetails is the detail files diff --git 
a/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml b/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml index c63c1bb54..97091a403 100644 --- a/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml +++ b/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml @@ -71,7 +71,7 @@ spec: - name type: object fileGroups: - description: FileGroups included files Grouped by Datasource + description: FileGroups included files Grouped by VersionedDataset items: properties: paths: @@ -197,7 +197,8 @@ spec: type: object type: array source: - description: From defines the source which provides these files + description: From defines the datasource which provides these + files properties: apiGroup: description: APIGroup is the group for the resource being diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index a5c07dfc4..cf2c1aba3 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -305,6 +305,20 @@ rules: - get - patch - update +- apiGroups: + - arcadia.kubeagi.k8s.com.cn + resources: + - versioneddataset + verbs: + - get + - list + - watch +- apiGroups: + - arcadia.kubeagi.k8s.com.cn + resources: + - versioneddataset/status + verbs: + - get - apiGroups: - arcadia.kubeagi.k8s.com.cn resources: diff --git a/config/samples/arcadia_v1alpha1_dataset.yaml b/config/samples/arcadia_v1alpha1_dataset.yaml index 6fa860886..50d2ad103 100644 --- a/config/samples/arcadia_v1alpha1_dataset.yaml +++ b/config/samples/arcadia_v1alpha1_dataset.yaml @@ -1,6 +1,10 @@ apiVersion: arcadia.kubeagi.k8s.com.cn/v1alpha1 kind: Dataset metadata: - name: dataset-sample + name: dataset-playground + namespace: arcadia spec: - # TODO(user): Add fields here + contentType: text + description: test dataset management + displayName: dataset example + field: finance diff --git a/config/samples/arcadia_v1alpha1_knowledgebase.yaml b/config/samples/arcadia_v1alpha1_knowledgebase.yaml index 99215cc45..aee810206 100644 --- 
a/config/samples/arcadia_v1alpha1_knowledgebase.yaml +++ b/config/samples/arcadia_v1alpha1_knowledgebase.yaml @@ -7,20 +7,17 @@ spec: displayName: "测试 KnowledgeBase" description: "测试 KnowledgeBase" embedder: - apiGroup: arcadia.kubeagi.k8s.com.cn/v1alpha1 kind: Embedders name: zhipuai-embedders-sample namespace: arcadia vectorStore: - apiGroup: arcadia.kubeagi.k8s.com.cn/v1alpha1 kind: VectorStores name: chroma-sample namespace: arcadia fileGroups: - source: - apiGroup: arcadia.kubeagi.k8s.com.cn/v1alpha1 - kind: Datasources - name: arcadia-local + kind: VersionedDataset + name: dataset-playground-v1 namespace: arcadia paths: - - example-test/knowledgebase-1.txt + - dataset/dataset-playground/v1/qa.csv diff --git a/config/samples/arcadia_v1alpha1_versioneddataset.yaml b/config/samples/arcadia_v1alpha1_versioneddataset.yaml index 1324c257f..9fb37c2fc 100644 --- a/config/samples/arcadia_v1alpha1_versioneddataset.yaml +++ b/config/samples/arcadia_v1alpha1_versioneddataset.yaml @@ -1,6 +1,19 @@ apiVersion: arcadia.kubeagi.k8s.com.cn/v1alpha1 kind: VersionedDataset metadata: - name: versioneddataset-sample + name: dataset-playground-v1 + namespace: arcadia spec: - # TODO(user): Add fields here + dataset: + kind: Dataset + name: dataset-playground + namespace: arcadia + fileGroups: + - source: + kind: Datasource + name: arcadia-local + namespace: arcadia + paths: + - qa.csv + released: 0 + version: v1 diff --git a/controllers/knowledgebase_controller.go b/controllers/knowledgebase_controller.go index 9ea1c565a..9813ad16b 100644 --- a/controllers/knowledgebase_controller.go +++ b/controllers/knowledgebase_controller.go @@ -42,6 +42,7 @@ import ( arcadiav1alpha1 "github.com/kubeagi/arcadia/api/v1alpha1" "github.com/kubeagi/arcadia/pkg/config" "github.com/kubeagi/arcadia/pkg/datasource" + pkgdocumentloaders "github.com/kubeagi/arcadia/pkg/documentloaders" "github.com/kubeagi/arcadia/pkg/embeddings" zhipuaiembeddings "github.com/kubeagi/arcadia/pkg/embeddings/zhipuai" 
"github.com/kubeagi/arcadia/pkg/llms/zhipuai" @@ -54,11 +55,10 @@ const ( ) var ( - errNoDataSource = fmt.Errorf("no datasource") - errDataSourceTypeUnkonwn = fmt.Errorf("unknown datasource type") - errDataSourceNotReady = fmt.Errorf("datasource is not ready") - errEmbedderNotReady = fmt.Errorf("embedder is not ready") - errVectorStoreNotReady = fmt.Errorf("vector store is not ready") + errNoSource = fmt.Errorf("no source") + errDataSourceNotReady = fmt.Errorf("datasource is not ready") + errEmbedderNotReady = fmt.Errorf("embedder is not ready") + errVectorStoreNotReady = fmt.Errorf("vector store is not ready") ) // KnowledgeBaseReconciler reconciles a KnowledgeBase object @@ -72,8 +72,8 @@ type KnowledgeBaseReconciler struct { //+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=knowledgebases/finalizers,verbs=update //+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=embedders,verbs=get;list;watch //+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=embedders/status,verbs=get -//+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=datasources,verbs=get;list;watch -//+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=datasources/status,verbs=get +//+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=versioneddatasets,verbs=get;list;watch +//+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=versioneddatasets/status,verbs=get //+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=vectorstores,verbs=get;list;watch //+kubebuilder:rbac:groups=arcadia.kubeagi.k8s.com.cn,resources=vectorstores/status,verbs=get @@ -227,28 +227,39 @@ func (r *KnowledgeBaseReconciler) setCondition(kb *arcadiav1alpha1.KnowledgeBase func (r *KnowledgeBaseReconciler) reconcileFileGroup(ctx context.Context, log logr.Logger, kb *arcadiav1alpha1.KnowledgeBase, vectorStore *arcadiav1alpha1.VectorStore, embedder *arcadiav1alpha1.Embedder, group arcadiav1alpha1.FileGroup) (err error) { defer func() { if err != nil { - 
err = fmt.Errorf("failed to reconcile FileGroup.DataSource: %s: %w", group.Source.Name, err) + err = fmt.Errorf("failed to reconcile FileGroup.Source: %s: %w", group.Source.Name, err) } }() if group.Source == nil { - return errNoDataSource + return errNoSource } - dataSource := &arcadiav1alpha1.Datasource{} + versionedDataset := &arcadiav1alpha1.VersionedDataset{} ns := group.Source.GetNamespace() - if err = r.Get(ctx, types.NamespacedName{Name: group.Source.Name, Namespace: ns}, dataSource); err != nil { + if err = r.Get(ctx, types.NamespacedName{Name: group.Source.Name, Namespace: ns}, versionedDataset); err != nil { if errors.IsNotFound(err) { - return errNoDataSource + return errNoSource } else { return err } } - if !dataSource.Status.IsReady() { + if !versionedDataset.Status.IsReady() { return errDataSourceNotReady } - if dataSource.Spec.Type() == arcadiav1alpha1.DatasourceTypeUnknown { - return errDataSourceTypeUnkonwn + + system, err := config.GetSystemDatasource(ctx, r.Client) + if err != nil { + return err + } + endpoint := system.Spec.Enpoint.DeepCopy() + if endpoint != nil && endpoint.AuthSecret != nil { + endpoint.AuthSecret.WithNameSpace(system.Namespace) + } + ds, err := datasource.NewLocal(ctx, r.Client, endpoint) + if err != nil { + return err } + info := &arcadiav1alpha1.OSS{Bucket: ns} if len(kb.Status.FileGroupDetail) == 0 { kb.Status.FileGroupDetail = make([]arcadiav1alpha1.FileGroupDetail, 1) @@ -257,7 +268,7 @@ func (r *KnowledgeBaseReconciler) reconcileFileGroup(ctx context.Context, log lo var fileGroupDetail *arcadiav1alpha1.FileGroupDetail pathMap := make(map[string]*arcadiav1alpha1.FileDetails, 1) for i, detail := range kb.Status.FileGroupDetail { - if detail.Source != nil && detail.Source.Name == dataSource.Name && detail.Source.GetNamespace() == dataSource.GetNamespace() { + if detail.Source != nil && detail.Source.Name == versionedDataset.Name && detail.Source.GetNamespace() == versionedDataset.GetNamespace() { fileGroupDetail = 
&kb.Status.FileGroupDetail[i] for i, detail := range fileGroupDetail.FileDetails { pathMap[detail.Path] = &fileGroupDetail.FileDetails[i] // FIXME 这样对不? @@ -271,35 +282,6 @@ func (r *KnowledgeBaseReconciler) reconcileFileGroup(ctx context.Context, log lo kb.Status.FileGroupDetail = append(kb.Status.FileGroupDetail, *fileGroupDetail) } - var ds datasource.Datasource - info := &arcadiav1alpha1.OSS{} - switch dataSource.Spec.Type() { - case arcadiav1alpha1.DatasourceTypeLocal: - system, err := config.GetSystemDatasource(ctx, r.Client) - if err != nil { - return err - } - endpoint := system.Spec.Enpoint.DeepCopy() - if endpoint != nil && endpoint.AuthSecret != nil { - endpoint.AuthSecret.WithNameSpace(system.Namespace) - } - ds, err = datasource.NewLocal(ctx, r.Client, endpoint) - if err != nil { - return err - } - info = &arcadiav1alpha1.OSS{Bucket: dataSource.Namespace} - case arcadiav1alpha1.DatasourceTypeOSS: - endpoint := dataSource.Spec.Enpoint.DeepCopy() - // set auth secret's namespace to the datasource's namespace - if endpoint.AuthSecret != nil { - endpoint.AuthSecret.WithNameSpace(dataSource.Namespace) - } - ds, err = datasource.NewOSS(ctx, r.Client, endpoint) - if err != nil { - return err - } - info = dataSource.Spec.OSS.DeepCopy() - } errs := make([]error, 0) for _, path := range group.Paths { fileDatail, ok := pathMap[path] @@ -321,42 +303,40 @@ func (r *KnowledgeBaseReconciler) reconcileFileGroup(ctx context.Context, log lo fileDatail.UpdateErr(err) continue } - switch dataSource.Spec.Type() { - case arcadiav1alpha1.DatasourceTypeLocal, arcadiav1alpha1.DatasourceTypeOSS: - objectStat, ok := stat.(minio.ObjectInfo) - log.V(0).Info(fmt.Sprintf("minio StatFile:%#v", objectStat), "path", path) - if !ok { - err = fmt.Errorf("failed to convert stat to minio.ObjectInfo:%s", path) - errs = append(errs, err) - fileDatail.UpdateErr(err) - continue - } - if objectStat.ETag == fileDatail.Checksum { - fileDatail.LastUpdateTime = metav1.Now() - continue - } - 
fileDatail.Checksum = objectStat.ETag - tags, err := ds.GetTags(ctx, info) - if err != nil { - errs = append(errs, err) - fileDatail.UpdateErr(err) - continue - } - file, err := ds.ReadFile(ctx, info) - if err != nil { - errs = append(errs, err) - fileDatail.UpdateErr(err) - continue - } - defer file.Close() - if err = r.handleFile(ctx, log, file, info.Object, tags, kb, vectorStore, embedder); err != nil { - err = fmt.Errorf("failed to handle file:%s: %w", path, err) - errs = append(errs, err) - fileDatail.UpdateErr(err) - continue - } - fileDatail.UpdateErr(nil) + + objectStat, ok := stat.(minio.ObjectInfo) + log.V(0).Info(fmt.Sprintf("minio StatFile:%#v", objectStat), "path", path) + if !ok { + err = fmt.Errorf("failed to convert stat to minio.ObjectInfo:%s", path) + errs = append(errs, err) + fileDatail.UpdateErr(err) + continue + } + if objectStat.ETag == fileDatail.Checksum { + fileDatail.LastUpdateTime = metav1.Now() + continue + } + fileDatail.Checksum = objectStat.ETag + tags, err := ds.GetTags(ctx, info) + if err != nil { + errs = append(errs, err) + fileDatail.UpdateErr(err) + continue + } + file, err := ds.ReadFile(ctx, info) + if err != nil { + errs = append(errs, err) + fileDatail.UpdateErr(err) + continue + } + defer file.Close() + if err = r.handleFile(ctx, log, file, info.Object, tags, kb, vectorStore, embedder); err != nil { + err = fmt.Errorf("failed to handle file:%s: %w", path, err) + errs = append(errs, err) + fileDatail.UpdateErr(err) + continue } + fileDatail.UpdateErr(nil) } return utilerrors.NewAggregate(errs) } @@ -394,7 +374,7 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge case "txt": loader = documentloaders.NewText(dataReader) case "csv": - loader = documentloaders.NewCSV(dataReader) + loader = pkgdocumentloaders.NewQACSV(dataReader, fileName, "q", "a") case "html", "htm": loader = documentloaders.NewHTML(dataReader) default: diff --git 
a/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml b/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml index c63c1bb54..97091a403 100644 --- a/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml +++ b/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml @@ -71,7 +71,7 @@ spec: - name type: object fileGroups: - description: FileGroups included files Grouped by Datasource + description: FileGroups included files Grouped by VersionedDataset items: properties: paths: @@ -197,7 +197,8 @@ spec: type: object type: array source: - description: From defines the source which provides these files + description: From defines the datasource which provides these + files properties: apiGroup: description: APIGroup is the group for the resource being diff --git a/deploy/charts/arcadia/templates/rbac.yaml b/deploy/charts/arcadia/templates/rbac.yaml index 819fda9ee..ed502c84f 100644 --- a/deploy/charts/arcadia/templates/rbac.yaml +++ b/deploy/charts/arcadia/templates/rbac.yaml @@ -322,6 +322,20 @@ rules: - get - patch - update +- apiGroups: + - arcadia.kubeagi.k8s.com.cn + resources: + - versioneddatasets + verbs: + - get + - list + - watch +- apiGroups: + - arcadia.kubeagi.k8s.com.cn + resources: + - versioneddatasets/status + verbs: + - get - apiGroups: - arcadia.kubeagi.k8s.com.cn resources: diff --git a/pkg/documentloaders/qa_csv.go b/pkg/documentloaders/qa_csv.go new file mode 100644 index 000000000..872af964f --- /dev/null +++ b/pkg/documentloaders/qa_csv.go @@ -0,0 +1,95 @@ +package documentloaders + +import ( + "context" + "encoding/csv" + "errors" + "fmt" + "io" + "strings" + + "github.com/tmc/langchaingo/documentloaders" + "github.com/tmc/langchaingo/schema" + "github.com/tmc/langchaingo/textsplitter" +) + +// QACSV represents a QA CSV document loader. 
+type QACSV struct { + r io.Reader + fileName string + questionColumn string + answerColumn string +} + +var _ documentloaders.Loader = QACSV{} + +// NewQACSV creates a new qa csv loader with an io.Reader and optional column names for filtering. +func NewQACSV(r io.Reader, fileName string, questionColumn string, answerColumn string) QACSV { + if questionColumn == "" { + questionColumn = "q" + } + if answerColumn == "" { + answerColumn = "a" + } + return QACSV{ + r: r, + fileName: fileName, + questionColumn: questionColumn, + answerColumn: answerColumn, + } +} + +// Load reads from the io.Reader and returns a single document with the data. +func (c QACSV) Load(_ context.Context) ([]schema.Document, error) { + var header []string + var docs []schema.Document + var rown int + + rd := csv.NewReader(c.r) + for { + row, err := rd.Read() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return nil, err + } + if len(header) == 0 { + header = append(header, row...) + continue + } + + doc := schema.Document{} + for i, value := range row { + if c.questionColumn != "" && header[i] != c.questionColumn && header[i] != c.answerColumn { + continue + } + value = strings.TrimSpace(value) + if header[i] == c.questionColumn { + doc.PageContent = fmt.Sprintf("%s: %s", header[i], value) + } + if header[i] == c.answerColumn { + doc.Metadata = map[string]any{ + c.answerColumn: value, + "fileName": c.fileName, + "lineNumber": rown, + } + } + } + rown++ + docs = append(docs, doc) + } + + return docs, nil +} + +// LoadAndSplit reads text data from the io.Reader and splits it into multiple +// documents using a text splitter. 
+func (c QACSV) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) { + docs, err := c.Load(ctx) + if err != nil { + return nil, err + } + + return textsplitter.SplitDocuments(splitter, docs) +} diff --git a/pkg/documentloaders/qa_csv_test.go b/pkg/documentloaders/qa_csv_test.go new file mode 100644 index 000000000..28ead8a2b --- /dev/null +++ b/pkg/documentloaders/qa_csv_test.go @@ -0,0 +1,43 @@ +package documentloaders + +import ( + "context" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCSVLoader(t *testing.T) { + t.Parallel() + fileName := "./testdata/qa.csv" + file, err := os.Open(fileName) + assert.NoError(t, err) + + loader := NewQACSV(file, fileName, "q", "a") + + docs, err := loader.Load(context.Background()) + require.NoError(t, err) + require.Len(t, docs, 25) + + expected1PageContent := "q: 什么是员工考勤管理制度?" + assert.Equal(t, docs[0].PageContent, expected1PageContent) + + expected1Metadata := map[string]any{ + "a": "该制度旨在严格工作纪律、提高工作效率,规范公司考勤管理,为公司考勤管理提供明确依据。", + "fileName": fileName, + "lineNumber": 0, + } + assert.Equal(t, docs[0].Metadata, expected1Metadata) + + expected2PageContent := "q: 该制度适用于哪些员工?" 
+ assert.Equal(t, docs[1].PageContent, expected2PageContent) + + expected2Metadata := map[string]any{ + "a": "适用于公司全体正式员工及实习生。", + "fileName": fileName, + "lineNumber": 1, + } + assert.Equal(t, docs[1].Metadata, expected2Metadata) +} diff --git a/pkg/documentloaders/testdata/qa.csv b/pkg/documentloaders/testdata/qa.csv new file mode 100644 index 000000000..7cba9ad49 --- /dev/null +++ b/pkg/documentloaders/testdata/qa.csv @@ -0,0 +1,26 @@ +q,a +什么是员工考勤管理制度?,该制度旨在严格工作纪律、提高工作效率,规范公司考勤管理,为公司考勤管理提供明确依据。 +该制度适用于哪些员工?,适用于公司全体正式员工及实习生。 +该制度的目的是什么?,为了规范公司考勤管理,提高工作效率,严格工作纪律,并为公司考勤管理提供明确依据。 +该制度使用范围包括哪些员工?,1、公司全体正式员工及实习生;2、员工应严格遵守工作律及考勤规章制度;3、各部门负责人在权限范围内有审批部门员工考勤记录的权利和严肃考勤纪律的义务,并以身作则,规范执行;4、人力资源部负责考勤信息的记录、汇总,监督考勤制度的执行。 +考勤时间是如何规定的?,1)公司执行五天弹性工作制,上班时间为 9:00-9:30,下班时间为 18:00-18:30,中午 12:00-13:00 为午休时间,不计入工作时间;每天工作时间不少于 8 小时;2)公司考虑交通通勤情况,每天上班给予 10 分钟延迟;9:40 后为迟到打卡,每月最多迟到 3 次(不晚于 10:00),超出则视为旷工;晚于 10:00 打卡且无正当理由,视为旷工半天;3)因工作原因下班晚走 2 小时,第二天打卡时间不晚于上午 10:00,考勤打卡数据将作为员工日常管理和薪资核算的重要依据。 +员工因工作需要外出的需如何申请?,需在钉钉提交《外出申请》审批。 +员工因工作需要出差的需如何申请?,需在互联 ERP 系统提交《出差申请》审批。 +什么是旷工?,旷工指未请假或请假未批准而擅自不上班或离岗者,不服从调动和工作分配,未按时到岗报到者;请假期限已满未续假或续假未批准而逾期不归者;有证据证明请假原因不属实、伪造或骗取请假、休假证件者;未按离职规定程序履行离职申请、审批,经要求仍拒不履行离职规定程序而擅自离岗者;因打架、斗殴致伤而不能上班者,且对无故被打伤的员工,其养伤期间的工资和医疗费用,经公安部门或保安部门裁决,由肇事者负责;在事假期间进行非法活动或进行有损公司利益、名誉的行为者;未经部门直接领导批准而擅自不按时到岗或离岗超过 2 小时的(包括但不限于上班时离岗、做与工作无关的活儿、拒绝工作及公司相关制度规定的其他情形),将视为旷工,未在岗位达 2 小时以上,4 小时以内的,按照旷工 0.5 天计算,达 4 小时及以上的,按照旷工一天计算;未按照考勤时间和制度打卡及上下班考勤漏打卡逾期未补者。 +旷工有哪些处罚措施?,1)扣除旷工相应天数工资的 100%,直接影响其所有考核成绩;2)连续旷工 2 天(含)或任意一年(非自然年)跨度期间内累计旷工达 3 天(含)以上,视为重大违纪行为,公司有权即时解除劳动关系,不给予任何经济补偿。 +旷工最小计算单位为多少天?,旷工最小计算单位为 0.5 天。 +审批权限、请假类别及处理办法一共有多少个主题内容?,10 个主题内容。 +审批权限的具体内容是什么?,员工请假时间小于等于 2 天,由直接上级、部门负责人审批,人力资源部备案;员工请假时间大于等于 3 天,依次由直接上级、部门负责人、公司管理层审批,人力资源部备案。 +请假类别包括哪些?,事假、病假、婚假、产假、陪产假、流产假、男女员工节育假期。 +事假申请需要满足什么条件?,申请事假须至少提前 1 天在钉钉上发起请假申请,经直属领导逐级审批通过后,抄送人力资源部备案。事假最小计算单位为 0.5 天,不足 0.5 天以 0.5 天计算,以此类推。 +什么情况下公司不会准假病假?,病假理由不充分或有碍工作进度,公司可不予准假。一年内累计不能超过 20 天。病假扣除事假相应天数工资,期间无其他奖金、福利和补助。 +婚假的具体内容是什么?,员工通过试用期考核,转正后可享受 10 
天婚假,婚假按国家规定必须为连续不间断计算,一次休完,包括法定节假日和双休日。员工婚假必须是进入本公司后注册结婚且在注册日起 1 年内申请,逾期视为放弃婚假权利。注:如国家政策有变,按最新政策执行,不再另行约定。 +产假的具体内容是什么?,符合国家计划生育条例规定的女员工依法享受相应产假,在产前 15 天开始计入产假。配偶生育的给予男员工相应陪产假。2、孕期女员工在工作时间内进行产前检查必须提供政府计生部门出具的准生证明原件及当地区级或二级以上医院开具的产检证明,则视同出勤,产检假期如下:怀孕第 1-6 个月,每个月可享受 1 天假期;怀孕第 7-8 个月,每月可享受 2 天假期;怀孕 9 个月以上,每月可享受 4 天假期,但其中 2 天已包括在预产假中。产检假提前至少 1 天在钉钉申请。3、女员工申请产假前须提供准生证明原件,提前至少 3 天在钉钉申请产假,人力资源部审核确认并存档。在产后 90 天内将婴儿出生证明原件及其他证明原件交至人力资源部,以便核实具体产假时间及办理相关生育费用报销手续。4、女员工在婴儿 1 周岁以前,公司给予其每天 1 小时的哺乳时间,对于生育多胞胎的,每多 1 个婴儿每天增加 1 小时哺乳时间。具体时间须提前与直属上级和部门负责人协商,报人力资源部审核确认并存档。5、男员工申请陪产假必须提供婴儿出生证明原件和结婚证原件,自婴儿出生后 2 个月内休完,逾期视其自动放弃休假,且公司不予任何形式补偿。试用期男员工陪产假需转正后方可申请。6、产假、产检假、陪产假、流产、男女员工节育假期按国家规定必须为连续不间断计算,均包括法定节假日和双休日。产假及相关衍生假期间按照不低于当地最低工资标准发放工资,待生育津贴报销后补齐差额,不计发其他奖金福利补助。7、违反国家计划生育条例规定的女员工不享受产假与产检假期间的工资待遇及其他奖金福利补助,期间按事假规定处理,且自行承担生育等费用。 +丧假的具体内容是什么?,员工直系亲属(父母、配偶、子女、配偶父母)丧亡,可申请丧假 3 天。非直系亲属(祖父母、外祖父母、兄弟姐妹)丧亡,可申请丧假 1 天。丧假必须为连续不间断计算。员工申请丧假在钉钉流程申请丧假,丧假期间全额发放工资,不计发其他奖金福利补助。 +年假的具体内容是什么?,员工转正后享有 5 天及以上带薪年假(具体年假基数见下表)。此后司龄每增加一年,年假增加一天,最多 10 天。新入职员工,当年度(均指公历自然年度)年假天数按照员工在本公司已服务日历天数折算确定,计算方法为:(本公司工作日历天数÷365 天)×应享受的年假天 +什么是人力资源部对于申请的带薪假期的最终审核权?,人力资源部对于申请的所有带薪假期(调休、年假、法定节假日、婚假、产假、产检假、陪产假、丧假)具有最终审核权,根据所有带薪假期提供的相应证明原件审核带薪假期资格,审核未通过将做普通事假处理。 +如果在审核带薪假期资格时,发现相关证明原件可能为伪造虚报,公司会有什么处理?,一经查证,作旷工处理,情节严重者给予内部严重警告,将全公司发通告,将视情节恶劣程度,最高给予开除处理,且不承担任何经济补偿。 +试用期员工在试用期间需要达到多少出勤率?,试用期员工在试用期间出勤率必须达到 95% 及以上,否则视为不符合岗位录用条件,公司有权即时解除劳动关系,终止劳动合同。 +如果试用期员工因特殊情况请假累计超过 5 天,需要提出什么申请?,员工须提出书面情况说明,申请获批后与公司协商变更试用期周期及出勤率方可请假。 +本制度从什么时间开始生效执行?,本制度从 2023 年 5 月 1 日起生效执行。 +本制度由哪个部门负责解释及修订?,本制度由人力资源部负责解释及修订,未尽事宜另行通知说明。 diff --git a/pkg/scheduler/executor.go b/pkg/scheduler/executor.go index 88e80b76c..221901ec9 100644 --- a/pkg/scheduler/executor.go +++ b/pkg/scheduler/executor.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/kubeagi/arcadia/api/v1alpha1" + "github.com/kubeagi/arcadia/pkg/config" "github.com/kubeagi/arcadia/pkg/datasource" ) @@ -78,7 +79,18 @@ func (e *executor) generateJob(ctx context.Context, 
jobCh chan<- JobPayload, dat klog.V(5).Infof("[Debug] get datasource %+v\n", *ds) - oss, err := datasource.NewOSS(ctx, e.client, ds.Spec.Enpoint) + endpoint := ds.Spec.Enpoint + if ds.Spec.Type() == v1alpha1.DatasourceTypeLocal { + system, err := config.GetSystemDatasource(ctx, e.client) + if err != nil { + return err + } + endpoint = system.Spec.Enpoint.DeepCopy() + if endpoint != nil && endpoint.AuthSecret != nil { + endpoint.AuthSecret.WithNameSpace(system.Namespace) + } + } + oss, err := datasource.NewOSS(ctx, e.client, endpoint) if err != nil { klog.Errorf("generateJob: get oss client error %s", err) diff --git a/tests/example-test.sh b/tests/example-test.sh index 097df1bf9..faa913b08 100755 --- a/tests/example-test.sh +++ b/tests/example-test.sh @@ -44,24 +44,25 @@ function debugInfo { if [[ $debug -ne 0 ]]; then exit 1 fi + if [[ $GITHUB_ACTIONS == "true" ]]; then + warning "debugInfo start 🧐" + mkdir -p $LOG_DIR - warning "debugInfo start 🧐" - mkdir -p $LOG_DIR + warning "1. Try to get all resources " + kubectl api-resources --verbs=list -o name | xargs -n 1 kubectl get -A --ignore-not-found=true --show-kind=true >$LOG_DIR/get-all-resources-list.log + kubectl api-resources --verbs=list -o name | xargs -n 1 kubectl get -A -oyaml --ignore-not-found=true --show-kind=true >$LOG_DIR/get-all-resources-yaml.log - warning "1. Try to get all resources " - kubectl api-resources --verbs=list -o name | xargs -n 1 kubectl get -A --ignore-not-found=true --show-kind=true >$LOG_DIR/get-all-resources-list.log - kubectl api-resources --verbs=list -o name | xargs -n 1 kubectl get -A -oyaml --ignore-not-found=true --show-kind=true >$LOG_DIR/get-all-resources-yaml.log + warning "2. Try to describe all resources " + kubectl api-resources --verbs=list -o name | xargs -n 1 kubectl describe -A >$LOG_DIR/describe-all-resources.log - warning "2. 
Try to describe all resources " - kubectl api-resources --verbs=list -o name | xargs -n 1 kubectl describe -A >$LOG_DIR/describe-all-resources.log + warning "3. Try to export kind logs to $LOG_DIR..." + kind export logs --name=${KindName} $LOG_DIR + sudo chown -R $USER:$USER $LOG_DIR - warning "3. Try to export kind logs to $LOG_DIR..." - kind export logs --name=${KindName} $LOG_DIR - sudo chown -R $USER:$USER $LOG_DIR - - warning "debugInfo finished ! " - warning "This means that some tests have failed. Please check the log. 🌚" - debug=1 + warning "debugInfo finished ! " + warning "This means that some tests have failed. Please check the log. 🌚" + debug=1 + fi exit 1 } trap 'debugInfo $LINENO' ERR @@ -197,17 +198,22 @@ info "6. create and verify vectorstore" info "6.1. helm install chroma" helm repo add chroma https://amikos-tech.github.io/chromadb-chart/ helm repo update chroma -helm install -narcadia chroma chroma/chromadb --set service.type=ClusterIP --set chromadb.auth.enabled=false --wait --timeout $HelmTimeout +if [[ $GITHUB_ACTIONS == "true" ]]; then + helm install -narcadia chroma chroma/chromadb --set service.type=ClusterIP --set chromadb.auth.enabled=false --wait --timeout $HelmTimeout +else + helm install -narcadia chroma chroma/chromadb --set service.type=ClusterIP --set chromadb.auth.enabled=false --wait --timeout $HelmTimeout --set image.repository=docker.io/abirdcfly/chroma +fi info "6.2. verify chroma vectorstore status" kubectl apply -f config/samples/arcadia_v1alpha1_vectorstore.yaml waitCRDStatusReady "VectorStore" "arcadia" "chroma-sample" info "7. create and verify knowledgebase" + info "7.1. 
upload some test file to system datasource" bucket=$(kubectl get datasource -n arcadia arcadia-minio -o json | jq -r .spec.oss.bucket) s3_key=$(kubectl get secrets -n arcadia arcadia-minio -o json | jq -r ".data.rootUser" | base64 --decode) s3_secret=$(kubectl get secrets -n arcadia arcadia-minio -o json | jq -r ".data.rootPassword" | base64 --decode) -resource="/${bucket}/example-test/knowledgebase-1.txt" +resource="/${bucket}/qa.csv" content_type="application/octet-stream" date=$(date -R) _signature="PUT\n\n${content_type}\n${date}\n${resource}" @@ -216,19 +222,27 @@ kubectl port-forward -n arcadia svc/arcadia-minio 9000:9000 >/dev/null 2>&1 & minio_pid=$! info "port-forward minio in pid: $minio_pid" sleep 3 -curl -X PUT -T "tests/knowledgebase-1.txt" \ +curl -X PUT -T "pkg/documentloaders/testdata/qa.csv" \ -H "Host: 127.0.0.1:9000" \ -H "Date: ${date}" \ -H "Content-Type: ${content_type}" \ -H "Authorization: AWS ${s3_key}:${signature}" \ http://127.0.0.1:9000${resource} -info "7.2. create embedder and wait it ready" + +info "7.2 create dataset and versioneddataset and wait them ready" +kubectl apply -f config/samples/arcadia_v1alpha1_dataset.yaml +kubectl apply -f config/samples/arcadia_v1alpha1_versioneddataset.yaml +waitCRDStatusReady "VersionedDataset" "arcadia" "dataset-playground-v1" + +info "7.3 create embedder and wait it ready" kubectl apply -f config/samples/arcadia_v1alpha1_embedders.yaml waitCRDStatusReady "Embedders" "arcadia" "zhipuai-embedders-sample" -info "7.3. create knowledgebase and wait it ready" + +info "7.4 create knowledgebase and wait it ready" kubectl apply -f config/samples/arcadia_v1alpha1_knowledgebase.yaml waitCRDStatusReady "KnowledgeBase" "arcadia" "knowledgebase-sample" -info "7.4. check this vectorstore has data" + +info "7.5 check this vectorstore has data" kubectl port-forward -n arcadia svc/chroma-chromadb 8000:8000 >/dev/null 2>&1 & chroma_pid=$! 
info "port-forward chroma in pid: $minio_pid" diff --git a/tests/knowledgebase-1.txt b/tests/knowledgebase-1.txt deleted file mode 100644 index 5732e758c..000000000 --- a/tests/knowledgebase-1.txt +++ /dev/null @@ -1,11 +0,0 @@ -## What is Arcadia? - -**Arcadia** comes from [Greek mythology](https://www.greekmythology.com/Myths/Places/Arcadia/arcadia.html)(a tranquil and idyllic region, representing harmony, serenity, and natural beauty). We aim to help everyone find a more perfect integration between humans and AI. - -To archieve this goal, we provide this one-stop LLMOps solution: - -- Dataset Management: storage/real-time data,multimodal,pre-processing,vectorization -- Models Management: local/online LLMs(development,training,deployment),inference acceleration -- Application Management: development,optimization,deployment with visual editor - -Furthermore, we can easily host **Arcadia** at any kubernetes cluster as production ready by integrating [kubebb](https://github.com/kubebb)(A kubernetes building blocks),